Gemmi C++ API
Loading...
Searching...
No Matches
cif.hpp
Go to the documentation of this file.
1// Copyright 2017 Global Phasing Ltd.
2//
3// CIF parser (based on PEGTL) with pluggable actions,
4// and a set of actions that prepare Document.
5
6#ifndef GEMMI_CIF_HPP_
7#define GEMMI_CIF_HPP_
8#include <cassert>
9#include <cstdio> // for FILE
10#include <iosfwd> // for size_t, istream
11#include <string>
12
13#include "third_party/tao/pegtl.hpp"
14//#include "third_party/tao/pegtl/contrib/tracer.hpp" // for debugging
15
16#include "cifdoc.hpp" // for Document, etc
17#include "input.hpp" // for CharArray
18#if defined(_WIN32)
19#include "fileutil.hpp" // for file_open
20#endif
21
22#if defined(_MSC_VER)
23#pragma warning(push)
24// warning C4244: an integer type is converted to a smaller integer type
25#pragma warning(disable: 4244)
26// warning C4267: conversion from 'size_t' to 'type', possible loss of data
27#pragma warning(disable: 4267)
28#endif
29
30namespace gemmi {
31namespace cif {
32using std::size_t;
33namespace pegtl = tao::pegtl;
34
35
36// **** grammar rules, named similarly as in the CIF 1.1 spec ****
// PEGTL grammar for CIF. Every struct below is a parsing rule; the Action and
// CheckAction templates defined later in this file are attached to some of
// them by specialization.
37namespace rules {
38
 // Custom rule: matches (and consumes) exactly one character whose class
 // returned by cif::char_table() equals TableVal; fails on empty input or
 // on a character of another class.
39 template<int TableVal> struct lookup_char {
 // Declares this rule to PEGTL's grammar analysis as rule_type::ANY.
40 using analyze_t = pegtl::analysis::generic<pegtl::analysis::rule_type::ANY>;
41 template<typename Input> static bool match(Input& in) {
42 if (!in.empty() && cif::char_table(in.peek_char()) == TableVal) {
 // Class 2 may contain a new-line, so the general bump() is used for it;
 // all other classes advance with bump_in_this_line().
43 if (TableVal == 2) // this set includes new-line
44 in.bump(1);
45 else
46 in.bump_in_this_line(1);
47 return true;
48 }
49 return false;
50 }
51 };
52
53 // (letter) refers to sections in Table 2.2.7.1 in Vol.G of ITfC (2006).
54
55 // (g) Character sets.
56 // OrdinaryCharacter: ! % & ()*+,-./0-9: <=>?@A-Z[] \ ^ `a-z{|}~
 // (characters of class 1 in cif::char_table())
57 using ordinary_char = lookup_char<1>;
58
 // White-space characters: class 2 in cif::char_table(), includes new-line.
59 using ws_char = lookup_char<2>;
60
61 // !"#$%&'()*+,-./0-9:;<=>?@A-Z[\]^_`a-z{|}~
62 struct nonblank_ch : pegtl::range<'!', '~'> {};
63
64 // ascii space is just before '!'
 // i.e. the range ' '..'~' plus the single character '\t'
65 struct anyprint_ch : pegtl::ranges<' ', '~', '\t'> {};
66
67
68 // (f) White space and comments.
 // A comment starts with '#' and runs to the end of the line or of the file.
69 struct comment : pegtl::if_must<pegtl::one<'#'>, pegtl::until<pegtl::eolf>>{};
 // One or more white-space characters and/or comments.
70 struct whitespace : pegtl::plus<pegtl::sor<ws_char, comment>> {};
71 struct ws_or_eof : pegtl::sor<whitespace, pegtl::eof> {};
72
73 // (b) Reserved words.
 // All of them are matched with the case-insensitive ISTRING macro.
74 struct str_data : TAOCPP_PEGTL_ISTRING("data_") {};
75 struct str_loop : TAOCPP_PEGTL_ISTRING("loop_") {};
76 struct str_global : TAOCPP_PEGTL_ISTRING("global_") {};
77 struct str_save : TAOCPP_PEGTL_ISTRING("save_") {};
78 struct str_stop : TAOCPP_PEGTL_ISTRING("stop_") {};
79 struct keyword : pegtl::sor<str_data, str_loop, str_global,
80 str_save, str_stop> {};
81
82 // (e) Character strings and text fields.
 // Closing quote: Q counts as the end of a quoted string only when it is
 // followed by space, new-line, CR, tab, '#' or end of input. The following
 // character is only looked at (pegtl::at), not consumed.
83 template<typename Q>
84 struct endq : pegtl::seq<Q, pegtl::at<pegtl::sor<
85 pegtl::one<' ','\n','\r','\t','#'>,
86 pegtl::eof>>> {};
87 // strict rule would be:
88 // template <typename Q> struct quoted_tail : until<endq<Q>, anyprint_ch> {};
89 // but it was relaxed after PDB accepted 5q1h with non-ascii character
90 template<typename Q>
91 struct quoted_tail : pegtl::until<endq<Q>, pegtl::not_one<'\n'>> {};
 // Opening quote Q; once it is seen, a matching quoted_tail is mandatory.
92 template<typename Q>
93 struct quoted : pegtl::if_must<Q, quoted_tail<Q>> {};
94 struct singlequoted : quoted<pegtl::one<'\''>> {};
95 struct doublequoted : quoted<pegtl::one<'"'>> {};
 // Text-field delimiter: ';' at the beginning of a line.
96 struct field_sep : pegtl::seq<pegtl::bol, pegtl::one<';'>> {};
97 // CIF 2.0 requires whitespace after text field, so it'd be:
98 // until<endq<field_sep>> instead of until<field_sep>.
99 struct textfield : pegtl::if_must<field_sep, pegtl::until<field_sep>> {};
 // Unquoted value: one or more non-blank characters that do not begin with
 // a keyword nor with '_', '$' or '#'.
100 struct unquoted : pegtl::seq<pegtl::not_at<keyword>,
101 pegtl::not_at<pegtl::one<'_','$','#'>>,
102 pegtl::plus<nonblank_ch>> {};
103
104 // (a) Basic structure of CIF. (c) Tags and values.
105 // datablockname in STAR/CIF should not be empty, but we made an exception
106 // for RELION which writes blocks starting with bare data_
107 struct datablockname : pegtl::star<nonblank_ch> {};
 // Block header: either data_<name> or global_.
108 struct datablockheading : pegtl::sor<pegtl::seq<str_data, datablockname>, str_global> {};
 // Tag: '_' followed by one or more non-blank characters.
109 struct tag : pegtl::seq<pegtl::one<'_'>, pegtl::plus<nonblank_ch>> {};
110 // unquoted value made of ordinary characters only - for a typical mmCIF file
111 // it is faster to check it first even if we backtrack on some values_.
112 struct simunq : pegtl::seq<pegtl::plus<ordinary_char>, pegtl::at<ws_char>> {};
 // The alternatives are tried in the listed order.
113 struct value : pegtl::sor<simunq, singlequoted, doublequoted,
114 textfield, unquoted> {};
 // item_* and loop_* are distinct types for the same tag/value grammar, so
 // that separate actions can be attached to each of them.
115 struct item_tag : tag {};
116 struct item_value : value {};
117 struct loop_tag : tag {};
118 struct loop_value : value {};
 // Optional stop_ keyword after a loop.
119 struct loop_end : pegtl::opt<str_stop, ws_or_eof> {};
 // loop_ keyword, then (mandatory from here on) whitespace, one or more
 // tags, and either one or more values or an empty loop body.
120 struct loop : pegtl::if_must<str_loop,
121 whitespace,
122 pegtl::plus<pegtl::seq<loop_tag, whitespace, pegtl::discard>>,
123 pegtl::sor<pegtl::plus<pegtl::seq<loop_value, ws_or_eof,
124 pegtl::discard>>,
125 // handle incorrect CIF with empty loop
126 pegtl::at<pegtl::sor<keyword, pegtl::eof>>>,
127 loop_end> {};
 // Matches at the beginning of a line (pegtl::bol); used in dataitem as the
 // alternative taken when no item_value follows the tag.
128 struct missing_value : pegtl::bol {};
 // Tag-value pair: tag, whitespace, then either a value followed by
 // whitespace/EOF or missing_value.
129 struct dataitem : pegtl::if_must<item_tag, whitespace,
130 pegtl::if_then_else<item_value, ws_or_eof,
131 missing_value>,
132 pegtl::discard> {};
133 struct framename : pegtl::plus<nonblank_ch> {};
134 struct endframe : str_save {};
 // Save frame: save_<name>, whitespace, any number of items and loops,
 // closing save_, then whitespace or EOF.
135 struct frame : pegtl::if_must<str_save, framename, whitespace,
136 pegtl::star<pegtl::sor<dataitem, loop>>,
137 endframe, ws_or_eof> {};
 // Data block: header, whitespace or EOF, then any number of items, loops
 // and save frames.
138 struct datablock : pegtl::seq<datablockheading, ws_or_eof,
139 pegtl::star<pegtl::sor<dataitem, loop, frame>>> {};
140 struct content : pegtl::plus<datablock> {};
 // Whole file: after optional leading whitespace, if the input is not at
 // EOF then one or more data blocks followed by EOF are required.
141 struct file : pegtl::seq<pegtl::opt<whitespace>,
142 pegtl::if_must<pegtl::not_at<pegtl::eof>,
143 content, pegtl::eof>> {};
 // Like file, but requires a single data block and does not require EOF
 // after it.
144 struct one_block : pegtl::seq<pegtl::opt<whitespace>,
145 pegtl::if_must<pegtl::not_at<pegtl::eof>, datablock>> {};
146
147
148} // namespace rules
149
150
151// **** error messages ****
152
// Returns the message reported when a mandatory Rule fails to match.
// This primary template supplies the generic text; rules that deserve
// a more specific message get an explicit specialization.
template<typename Rule> const std::string& error_message() {
  static const std::string fallback("parse error");
  return fallback;
}
157#define error_msg(rule, msg) \
158 template<> inline const std::string& error_message<rule>() { \
159 static const std::string s = msg; \
160 return s; \
161 }
162error_msg(rules::quoted_tail<pegtl::one<'\''>>, "unterminated 'string'")
163error_msg(rules::quoted_tail<pegtl::one<'"'>>, "unterminated \"string\"")
164error_msg(pegtl::until<rules::field_sep>, "unterminated text field")
165error_msg(rules::framename, "unnamed save_ frame")
166error_msg(rules::content, "expected block header (data_)")
167#undef error_msg
168
169template<typename Rule> struct Errors : public pegtl::normal<Rule> {
170 template<typename Input, typename ... States>
171 static void raise(const Input& in, States&& ...) {
172 throw pegtl::parse_error(error_message<Rule>()
173 //+ " matching " + pegtl::internal::demangle<Rule>()
174 , in);
175 }
176};
177
178// **** parsing actions that fill the storage ****
179
// Default parsing action: derives from pegtl::nothing<Rule>. Rules whose match
// has to be stored in Document get an explicit specialization below.
180template<typename Rule> struct Action : pegtl::nothing<Rule> {};
181
182// We don't store comments here. We don't have a proper storage for comments.
183// They can be stored as Items, but this leaves out comments before
184// the first block, comments inside loops, or between tag and value.
185// Additionally, a comment after a loop cannot be processed immediately
186// b/c at that point we don't know if the loop is finished yet.
187// If we were to store (a subset of) comments, we'd need to check first
188// how it affects performance.
189//template<> struct Action<rules::comment> {
190// template<typename Input> static void apply(const Input& in, Document& out) {
191// }
192//};
193
194template<> struct Action<rules::datablockname> {
195 template<typename Input> static void apply(const Input& in, Document& out) {
196 out.blocks.emplace_back(in.string());
197 Block& block = out.blocks.back();
198 // Empty block name (just data_ ) is not STAR/CIF conformant,
199 // but it's written by RELION and buccaneer; we must support it.
200 if (block.name.empty())
201 block.name += ' ';
202 out.items_ = &block.items;
203 }
204};
205template<> struct Action<rules::str_global> {
206 template<typename Input> static void apply(const Input&, Document& out) {
207 out.blocks.emplace_back();
208 out.items_ = &out.blocks.back().items;
209 }
210};
211template<> struct Action<rules::framename> {
212 template<typename Input> static void apply(const Input& in, Document& out) {
213 out.items_->emplace_back(FrameArg{in.string()});
214 out.items_->back().line_number = in.iterator().line;
215 out.items_ = &out.items_->back().frame.items;
216 }
217};
218template<> struct Action<rules::endframe> {
219 template<typename Input> static void apply(const Input&, Document& out) {
220 out.items_ = &out.blocks.back().items;
221 }
222};
223template<> struct Action<rules::item_tag> {
224 template<typename Input> static void apply(const Input& in, Document& out) {
225 out.items_->emplace_back(in.string());
226 out.items_->back().line_number = in.iterator().line;
227 }
228};
229template<> struct Action<rules::item_value> {
230 template<typename Input> static void apply(const Input& in, Document& out) {
231 Item& last_item = out.items_->back();
232 assert(last_item.type == ItemType::Pair);
233 last_item.pair[1] = in.string();
234 }
235};
236template<> struct Action<rules::str_loop> {
237 template<typename Input> static void apply(const Input& in, Document& out) {
238 out.items_->emplace_back(LoopArg{});
239 out.items_->back().line_number = in.iterator().line;
240 }
241};
242template<> struct Action<rules::loop_tag> {
243 template<typename Input> static void apply(const Input& in, Document& out) {
244 Item& last_item = out.items_->back();
245 assert(last_item.type == ItemType::Loop);
246 last_item.loop.tags.emplace_back(in.string());
247 }
248};
249template<> struct Action<rules::loop_value> {
250 template<typename Input> static void apply(const Input& in, Document& out) {
251 Item& last_item = out.items_->back();
252 assert(last_item.type == ItemType::Loop);
253 last_item.loop.values.emplace_back(in.string());
254 }
255};
256template<> struct Action<rules::loop> {
257 template<typename Input> static void apply(const Input& in, Document& out) {
258 Item& last_item = out.items_->back();
259 assert(last_item.type == ItemType::Loop);
260 const Loop& loop = last_item.loop;
261 if (loop.values.size() % loop.tags.size() != 0)
262 throw pegtl::parse_error(
263 "Wrong number of values in loop " + loop.common_prefix() + "*",
264 in);
265 }
266};
267
268
269template<typename Input> void parse_input(Document& d, Input&& in) {
270 pegtl::parse<rules::file, Action, Errors>(in, d);
271}
272
273template<typename Input> Document read_input(Input&& in) {
275 doc.source = in.source();
279 return doc;
280}
281
282template<typename Input>
284 pegtl::parse<rules::one_block, Action, Errors>(in, d);
285 return in.byte();
286}
287
// GEMMI_CIF_FILE_INPUT(in, path) declares a local PEGTL input object named
// `in` that reads the file at `path`.
// On Windows the file is opened with gemmi::file_open() and the pointer
// released from its result is passed to tao::pegtl::read_input<>;
// elsewhere tao::pegtl::file_input<> is constructed from the path.
// NOTE(review): the original remark below names pegtl::read_input as the
// variant that may use mmap, yet read_input is what the Windows branch uses;
// presumably pegtl::file_input was meant - confirm against PEGTL docs.
288// pegtl::read_input may use mmap and be faster, but does not work
289// on Windows with Unicode filenames.
290#if defined(_WIN32)
291#define GEMMI_CIF_FILE_INPUT(in, path) \
292 tao::pegtl::read_input<> in(gemmi::file_open(path.c_str(), "rb").release(), path)
293#else
294#define GEMMI_CIF_FILE_INPUT(in, path) \
295 tao::pegtl::file_input<> in(path)
296#endif
297
298inline Document read_file(const std::string& filename) {
300 return read_input(in);
301}
302
303inline Document read_string(const std::string& data) {
304 pegtl::memory_input<> in(data, "string");
305 return read_input(in);
306}
307
308inline Document read_memory(const char* data, size_t size, const char* name) {
309 pegtl::memory_input<> in(data, size, name);
310 return read_input(in);
311}
312
313inline Document read_cstream(std::FILE *f, size_t bufsize, const char* name) {
314 pegtl::cstream_input<> in(f, bufsize, name);
315 return read_input(in);
316}
317
318inline Document read_istream(std::istream &is,
319 size_t bufsize, const char* name) {
320 pegtl::istream_input<> in(is, bufsize, name);
321 return read_input(in);
322}
323
324
// Action set used by check_syntax(): derives from pegtl::nothing<Rule>, so
// only the rules with an explicit specialization below have an action.
325template<typename Rule> struct CheckAction : pegtl::nothing<Rule> {};
326
327template<> struct CheckAction<rules::missing_value> {
328 template<typename Input> static void apply(const Input& in) {
329 throw pegtl::parse_error("tag without value", in);
330 }
331};
332
333template<typename Input> bool check_syntax(Input&& in, std::string* msg) {
334 try {
335 return pegtl::parse<rules::file, CheckAction, Errors>(in);
336 } catch (pegtl::parse_error& e) {
337 if (msg)
338 *msg = e.what();
339 return false;
340 }
341}
342
343// A function for transparent reading of normal and compressed files.
344// T should have the same traits as BasicInput and MaybeGzipped.
345template<typename T>
347 if (input.is_stdin())
348 return read_cstream(stdin, 16*1024, "stdin");
349 if (CharArray mem = input.uncompress_into_buffer())
350 return read_memory(mem.data(), mem.size(), input.path().c_str());
351 return read_file(input.path());
352}
353
354template<typename T>
355bool check_syntax_any(T&& input, std::string* msg) {
356 if (CharArray mem = input.uncompress_into_buffer()) {
357 pegtl::memory_input<> in(mem.data(), mem.size(), input.path());
358 return check_syntax(in, msg);
359 }
361 return check_syntax(in, msg);
362}
363
364template<typename T>
365size_t read_one_block(Document& d, T&& input, size_t limit) {
366 if (input.is_stdin())
367 return parse_one_block(d, pegtl::cstream_input<>(stdin, 16*1024, "stdin"));
368 if (input.is_compressed()) {
369 CharArray mem = input.uncompress_into_buffer(limit);
370 return parse_one_block(d, pegtl::memory_input<>(mem.data(), mem.size(),
371 input.path().c_str()));
372 }
374 return parse_one_block(d, std::move(in));
375}
376
377#if defined(_MSC_VER)
378#pragma warning(pop)
379#endif
380
381} // namespace cif
382} // namespace gemmi
383#endif
#define error_msg(rule, msg)
Definition cif.hpp:157
#define GEMMI_CIF_FILE_INPUT(in, path)
Definition cif.hpp:294
void check_for_missing_values(const Document &d)
Definition cifdoc.hpp:1118
Document read_istream(std::istream &is, size_t bufsize, const char *name)
Definition cif.hpp:318
Document read_string(const std::string &data)
Definition cif.hpp:303
const std::string & error_message()
Definition cif.hpp:153
Document read_input(Input &&in)
Definition cif.hpp:273
size_t parse_one_block(Document &d, Input &&in)
Definition cif.hpp:283
bool check_syntax(Input &&in, std::string *msg)
Definition cif.hpp:333
Document read_cstream(std::FILE *f, size_t bufsize, const char *name)
Definition cif.hpp:313
void check_for_duplicates(const Document &d)
Definition cifdoc.hpp:1124
Document read(T &&input)
Definition cif.hpp:346
void parse_input(Document &d, Input &&in)
Definition cif.hpp:269
bool check_syntax_any(T &&input, std::string *msg)
Definition cif.hpp:355
Document read_file(const std::string &filename)
Definition cif.hpp:298
uint8_t char_table(char c)
Definition cifdoc.hpp:45
size_t read_one_block(Document &d, T &&input, size_t limit)
Definition cif.hpp:365
Document read_memory(const char *data, size_t size, const char *name)
Definition cif.hpp:308