Gemmi C++ API
Loading...
Searching...
No Matches
cif.hpp
Go to the documentation of this file.
1// Copyright 2017 Global Phasing Ltd.
2//
3// CIF parser (based on PEGTL) with pluggable actions,
4// and a set of actions that prepare Document.
5// To just read the CIF format, include read_cif.hpp instead.
6
7#ifndef GEMMI_CIF_HPP_
8#define GEMMI_CIF_HPP_
9#include <cassert>
10#include <cstdio> // for FILE
11#include <iosfwd> // for size_t, istream
12#include <string>
13
14#include "third_party/tao/pegtl.hpp"
15//#include "third_party/tao/pegtl/contrib/tracer.hpp" // for debugging
16
17#include "cifdoc.hpp" // for Document, etc
18#include "fileutil.hpp" // for CharArray, file_open
19
20#if defined(_MSC_VER)
21#pragma warning(push)
22// warning C4244: an integer type is converted to a smaller integer type
23#pragma warning(disable: 4244)
24// warning C4267: conversion from 'size_t' to 'type', possible loss of data
25#pragma warning(disable: 4267)
26#endif
27
28namespace gemmi {
29namespace cif {
30using std::size_t;
31namespace pegtl = tao::pegtl;
32
33
34// **** grammar rules, named similarly as in the CIF 1.1 spec ****
35namespace rules {
36
37 template<int TableVal> struct lookup_char {
38 using analyze_t = pegtl::analysis::generic<pegtl::analysis::rule_type::ANY>;
39 template<typename Input> static bool match(Input& in) {
40 if (!in.empty() && cif::char_table(in.peek_char()) == TableVal) {
41 if (TableVal == 2) // this set includes new-line
42 in.bump(1);
43 else
44 in.bump_in_this_line(1);
45 return true;
46 }
47 return false;
48 }
49 };
50
51 // (letter) refers to sections in Table 2.2.7.1 in Vol.G of ITfC (2006).
52
53 // (g) Character sets.
54 // OrdinaryCharacter: ! % & ()*+,-./0-9: <=>?@A-Z[] \ ^ `a-z{|}~
55 using ordinary_char = lookup_char<1>;
56
57 using ws_char = lookup_char<2>;
58
59 // !"#$%&'()*+,-./0-9:;<=>?@A-Z[\]^_`a-z{|}~
60 struct nonblank_ch : pegtl::range<'!', '~'> {};
61
62 // ascii space is just before '!'
63 struct anyprint_ch : pegtl::ranges<' ', '~', '\t'> {};
64
65
66 // (f) White space and comments.
67 struct comment : pegtl::if_must<pegtl::one<'#'>, pegtl::until<pegtl::eolf>>{};
68 struct whitespace : pegtl::plus<pegtl::sor<ws_char, comment>> {};
69 struct ws_or_eof : pegtl::sor<whitespace, pegtl::eof> {};
70
71 // (b) Reserved words.
72 struct str_data : TAOCPP_PEGTL_ISTRING("data_") {};
73 struct str_loop : TAOCPP_PEGTL_ISTRING("loop_") {};
74 struct str_global : TAOCPP_PEGTL_ISTRING("global_") {};
75 struct str_save : TAOCPP_PEGTL_ISTRING("save_") {};
76 struct str_stop : TAOCPP_PEGTL_ISTRING("stop_") {};
77 struct keyword : pegtl::sor<str_data, str_loop, str_global,
78 str_save, str_stop> {};
79
80 // (e) Character strings and text fields.
81 template<typename Q>
82 struct endq : pegtl::seq<Q, pegtl::at<pegtl::sor<
83 pegtl::one<' ','\n','\r','\t','#'>,
84 pegtl::eof>>> {};
85 // strict rule would be:
86 // template <typename Q> struct quoted_tail : until<endq<Q>, anyprint_ch> {};
87 // but it was relaxed after PDB accepted 5q1h with non-ascii character
88 template<typename Q>
89 struct quoted_tail : pegtl::until<endq<Q>, pegtl::not_one<'\n'>> {};
90 template<typename Q>
91 struct quoted : pegtl::if_must<Q, quoted_tail<Q>> {};
92 struct singlequoted : quoted<pegtl::one<'\''>> {};
93 struct doublequoted : quoted<pegtl::one<'"'>> {};
94 struct field_sep : pegtl::seq<pegtl::bol, pegtl::one<';'>> {};
95 // CIF 2.0 requires whitespace after text field, so it'd be:
96 // until<endq<field_sep>> instead of until<field_sep>.
97 struct textfield : pegtl::if_must<field_sep, pegtl::until<field_sep>> {};
98 struct unquoted : pegtl::seq<pegtl::not_at<keyword>,
99 pegtl::not_at<pegtl::one<'_','$','#'>>,
100 pegtl::plus<nonblank_ch>> {};
101
102 // (a) Basic structure of CIF. (c) Tags and values.
103 // datablockname in STAR/CIF should not be empty, but we made an exception
104 // for RELION which writes blocks starting with bare data_
105 struct datablockname : pegtl::star<nonblank_ch> {};
106 struct datablockheading : pegtl::sor<pegtl::seq<str_data, datablockname>, str_global> {};
107 struct tag : pegtl::seq<pegtl::one<'_'>, pegtl::plus<nonblank_ch>> {};
108 // unquoted value made of ordinary characters only - for a typical mmCIF file
109 // it is faster to check it first even if we backtrack on some values_.
110 struct simunq : pegtl::seq<pegtl::plus<ordinary_char>, pegtl::at<ws_char>> {};
111 struct value : pegtl::sor<simunq, singlequoted, doublequoted,
112 textfield, unquoted> {};
113 struct item_tag : tag {};
114 struct item_value : value {};
115 struct loop_tag : tag {};
116 struct loop_value : value {};
117 struct loop_end : pegtl::opt<str_stop, ws_or_eof> {};
118 struct loop : pegtl::if_must<str_loop,
119 whitespace,
120 pegtl::plus<pegtl::seq<loop_tag, whitespace, pegtl::discard>>,
121 pegtl::sor<pegtl::plus<pegtl::seq<loop_value, ws_or_eof,
122 pegtl::discard>>,
123 // handle incorrect CIF with empty loop
124 pegtl::at<pegtl::sor<keyword, pegtl::eof>>>,
125 loop_end> {};
126 struct missing_value : pegtl::bol {};
127 struct dataitem : pegtl::if_must<item_tag, whitespace,
128 pegtl::if_then_else<item_value, ws_or_eof,
129 missing_value>,
130 pegtl::discard> {};
131 struct framename : pegtl::plus<nonblank_ch> {};
132 struct endframe : str_save {};
133 struct frame : pegtl::if_must<str_save, framename, whitespace,
134 pegtl::star<pegtl::sor<dataitem, loop>>,
135 endframe, ws_or_eof> {};
136 struct datablock : pegtl::seq<datablockheading, ws_or_eof,
137 pegtl::star<pegtl::sor<dataitem, loop, frame>>> {};
138 struct content : pegtl::plus<datablock> {};
139 struct file : pegtl::seq<pegtl::opt<whitespace>,
140 pegtl::if_must<pegtl::not_at<pegtl::eof>,
141 content, pegtl::eof>> {};
142 struct one_block : pegtl::seq<pegtl::opt<whitespace>,
143 pegtl::if_must<pegtl::not_at<pegtl::eof>, datablock>> {};
144
145
146} // namespace rules
147
148
149// **** error messages ****
150
// Returns the message reported when the grammar rule `Rule` fails.
// This primary template provides the generic fallback text; rule-specific
// messages are supplied by explicit specializations.
// The returned reference stays valid for the lifetime of the program
// (it refers to a function-local static).
template<typename Rule> const std::string& error_message() {
  static const std::string fallback = "parse error";
  return fallback;
}
155#define error_msg(rule, msg) \
156 template<> inline const std::string& error_message<rule>() { \
157 static const std::string s = msg; \
158 return s; \
159 }
160error_msg(rules::quoted_tail<pegtl::one<'\''>>, "unterminated 'string'")
161error_msg(rules::quoted_tail<pegtl::one<'"'>>, "unterminated \"string\"")
162error_msg(pegtl::until<rules::field_sep>, "unterminated text field")
163error_msg(rules::framename, "unnamed save_ frame")
164error_msg(rules::content, "expected block header (data_)")
165#undef error_msg
166
167template<typename Rule> struct Errors : public pegtl::normal<Rule> {
168 template<typename Input, typename ... States>
169 static void raise(const Input& in, States&& ...) {
170 throw pegtl::parse_error(error_message<Rule>()
171 //+ " matching " + pegtl::internal::demangle<Rule>()
172 , in);
173 }
174};
175
176// **** parsing actions that fill the storage ****
177
178template<typename Rule> struct Action : pegtl::nothing<Rule> {};
179
180// We don't store comments here. We don't have a proper storage for comments.
181// They can be stored as Items, but this leaves out comments before
182// the first block, comments inside loops, or between tag and value.
183// Additionally, a comment after a loop cannot be processed immediately
184// b/c at that point we don't know if the loop is finished yet.
185// If we were to store (a subset of) comments, we'd need to check first
186// how it affects performance.
187//template<> struct Action<rules::comment> {
188// template<typename Input> static void apply(const Input& in, Document& out) {
189// }
190//};
191
192template<> struct Action<rules::datablockname> {
193 template<typename Input> static void apply(const Input& in, Document& out) {
194 out.blocks.emplace_back(in.string());
195 Block& block = out.blocks.back();
196 // Empty block name (just data_ ) is not STAR/CIF conformant,
197 // but it's written by RELION and buccaneer; we must support it.
198 if (block.name.empty())
199 block.name += ' ';
200 out.items_ = &block.items;
201 }
202};
203template<> struct Action<rules::str_global> {
204 template<typename Input> static void apply(const Input&, Document& out) {
205 out.blocks.emplace_back();
206 out.items_ = &out.blocks.back().items;
207 }
208};
209template<> struct Action<rules::framename> {
210 template<typename Input> static void apply(const Input& in, Document& out) {
211 out.items_->emplace_back(FrameArg{in.string()});
212 out.items_->back().line_number = in.iterator().line;
213 out.items_ = &out.items_->back().frame.items;
214 }
215};
216template<> struct Action<rules::endframe> {
217 template<typename Input> static void apply(const Input&, Document& out) {
218 out.items_ = &out.blocks.back().items;
219 }
220};
221template<> struct Action<rules::item_tag> {
222 template<typename Input> static void apply(const Input& in, Document& out) {
223 out.items_->emplace_back(in.string());
224 out.items_->back().line_number = in.iterator().line;
225 }
226};
227template<> struct Action<rules::item_value> {
228 template<typename Input> static void apply(const Input& in, Document& out) {
229 Item& last_item = out.items_->back();
230 assert(last_item.type == ItemType::Pair);
231 last_item.pair[1] = in.string();
232 }
233};
234template<> struct Action<rules::str_loop> {
235 template<typename Input> static void apply(const Input& in, Document& out) {
236 out.items_->emplace_back(LoopArg{});
237 out.items_->back().line_number = in.iterator().line;
238 }
239};
240template<> struct Action<rules::loop_tag> {
241 template<typename Input> static void apply(const Input& in, Document& out) {
242 Item& last_item = out.items_->back();
243 assert(last_item.type == ItemType::Loop);
244 last_item.loop.tags.emplace_back(in.string());
245 }
246};
247template<> struct Action<rules::loop_value> {
248 template<typename Input> static void apply(const Input& in, Document& out) {
249 Item& last_item = out.items_->back();
250 assert(last_item.type == ItemType::Loop);
251 last_item.loop.values.emplace_back(in.string());
252 }
253};
254template<> struct Action<rules::loop> {
255 template<typename Input> static void apply(const Input& in, Document& out) {
256 Item& last_item = out.items_->back();
257 assert(last_item.type == ItemType::Loop);
258 const Loop& loop = last_item.loop;
259 if (loop.values.size() % loop.tags.size() != 0)
260 throw pegtl::parse_error(
261 "Wrong number of values in loop " + loop.common_prefix() + "*",
262 in);
263 }
264};
265
266
267template<typename Input> void parse_input(Document& d, Input&& in) {
268 pegtl::parse<rules::file, Action, Errors>(in, d);
269}
270
271template<typename Input> Document read_input(Input&& in, int check_level=1) {
273 doc.source = in.source();
275 if (check_level > 0) {
278 if (check_level > 1) {
279 for (const cif::Block& block : doc.blocks) {
280 if (block.name == " ")
281 fail(doc.source + ": missing block name (bare data_)");
282 check_empty_loops(block, doc.source);
283 }
284 }
285 }
286 return doc;
287}
288
289template<typename Input>
291 pegtl::parse<rules::one_block, Action, Errors>(in, d);
292 return in.byte();
293}
294
// GEMMI_CIF_FILE_INPUT(in, path) declares a PEGTL input named `in` that
// reads the file `path`.
// pegtl::file_input may use mmap and be faster, but it does not work
// on Windows with Unicode filenames, so there the file is opened with
// gemmi::file_open() instead.
#ifdef _WIN32
#define GEMMI_CIF_FILE_INPUT(in, path) \
  tao::pegtl::read_input<> in(gemmi::file_open(path.c_str(), "rb").release(), path)
#else
#define GEMMI_CIF_FILE_INPUT(in, path) \
  tao::pegtl::file_input<> in(path)
#endif
304
305inline Document read_file(const std::string& filename, int check_level=1) {
307 return read_input(in, check_level);
308}
309
310inline Document read_memory(const char* data, size_t size, const char* name, int check_level=1) {
311 pegtl::memory_input<> in(data, size, name);
312 return read_input(in, check_level);
313}
314
315inline Document read_cstream(std::FILE *f, size_t bufsize, const char* name, int check_level=1) {
316 pegtl::cstream_input<> in(f, bufsize, name);
317 return read_input(in, check_level);
318}
319
320inline Document read_istream(std::istream &is, size_t bufsize, const char* name,
321 int check_level=1) {
322 pegtl::istream_input<> in(is, bufsize, name);
323 return read_input(in, check_level);
324}
325
326
327template<typename Rule> struct CheckAction : pegtl::nothing<Rule> {};
328
329template<> struct CheckAction<rules::missing_value> {
330 template<typename Input> static void apply(const Input& in) {
331 throw pegtl::parse_error("tag without value", in);
332 }
333};
334
335template<typename Input> bool try_parse(Input&& in, std::string* msg) {
336 try {
337 return pegtl::parse<rules::file, CheckAction, Errors>(in);
338 } catch (pegtl::parse_error& e) {
339 if (msg)
340 *msg = e.what();
341 return false;
342 }
343}
344
345// A function for transparent reading of normal and compressed files.
346// T should have the same traits as BasicInput and MaybeGzipped.
347template<typename T>
349 if (CharArray mem = input.uncompress_into_buffer())
350 return read_memory(mem.data(), mem.size(), input.path().c_str(), check_level);
351 if (input.is_stdin())
352 return read_cstream(stdin, 16*1024, "stdin", check_level);
353 return read_file(input.path(), check_level);
354}
355
356template<typename T>
357bool check_syntax(T&& input, std::string* msg) {
358 if (CharArray mem = input.uncompress_into_buffer()) {
359 pegtl::memory_input<> in(mem.data(), mem.size(), input.path());
360 return try_parse(in, msg);
361 }
363 return try_parse(in, msg);
364}
365
366template<typename T>
367size_t read_one_block(Document& d, T&& input, size_t limit) {
368 if (input.is_compressed()) {
369 CharArray mem = input.uncompress_into_buffer(limit);
370 return parse_one_block(d, pegtl::memory_input<>(mem.data(), mem.size(),
371 input.path().c_str()));
372 }
373 if (input.is_stdin())
374 return parse_one_block(d, pegtl::cstream_input<>(stdin, 16*1024, "stdin"));
376 return parse_one_block(d, std::move(in));
377}
378
379#if defined(_MSC_VER)
380#pragma warning(pop)
381#endif
382
383} // namespace cif
384} // namespace gemmi
385#endif
#define error_msg(rule, msg)
Definition cif.hpp:155
#define GEMMI_CIF_FILE_INPUT(in, path)
Definition cif.hpp:301
struct Document that represents the CIF file (but can also be read from a different representation,...
File-related utilities.
Document read(T &&input, int check_level=1)
Definition cif.hpp:348
void check_for_missing_values(const Document &d)
Definition cifdoc.hpp:1123
Document read_cstream(std::FILE *f, size_t bufsize, const char *name, int check_level=1)
Definition cif.hpp:315
bool try_parse(Input &&in, std::string *msg)
Definition cif.hpp:335
Document read_input(Input &&in, int check_level=1)
Definition cif.hpp:271
const std::string & error_message()
Definition cif.hpp:151
size_t parse_one_block(Document &d, Input &&in)
Definition cif.hpp:290
bool check_syntax(T &&input, std::string *msg)
Definition cif.hpp:357
void check_empty_loops(const cif::Block &block, const std::string &source)
Definition cifdoc.hpp:1164
Document read_istream(std::istream &is, size_t bufsize, const char *name, int check_level=1)
Definition cif.hpp:320
void check_for_duplicates(const Document &d)
Definition cifdoc.hpp:1129
Document read_file(const std::string &filename, int check_level=1)
Definition cif.hpp:305
void parse_input(Document &d, Input &&in)
Definition cif.hpp:267
uint8_t char_table(char c)
Definition cifdoc.hpp:44
Document read_memory(const char *data, size_t size, const char *name, int check_level=1)
Definition cif.hpp:310
size_t read_one_block(Document &d, T &&input, size_t limit)
Definition cif.hpp:367
void fail(const std::string &msg)
Definition fail.hpp:59