Gemmi C++ API
Loading...
Searching...
No Matches
select.hpp
Go to the documentation of this file.
1// Copyright 2018 Global Phasing Ltd.
2//
3// Selections.
4
5#ifndef GEMMI_SELECT_HPP_
6#define GEMMI_SELECT_HPP_
7
8#include <string>
9#include <cstdlib> // for strtol
10#include <cctype> // for isalpha
11#include <climits> // for INT_MIN, INT_MAX
12#include "fail.hpp" // for fail
13#include "util.hpp" // for is_in_list
14#include "model.hpp" // for Model, Chain, etc
15#include "iterator.hpp" // for FilterProxy
16#include "sprintf.hpp" // for to_str
17#include "atof.hpp" // for fast_from_chars
18
19namespace gemmi {
20
21// from http://www.ccp4.ac.uk/html/pdbcur.html
22// Specification of the selection sets:
23// either
24// /mdl/chn/s1.i1-s2.i2/at[el]:aloc
25// or
26// /mdl/chn/*(res).ic/at[el]:aloc
27//
28
29struct Selection {
30 struct List {
31 bool all = true;
32 bool inverted = false;
33 std::string list; // comma-separated
34
35 std::string str() const {
36 if (all)
37 return "*";
38 return inverted ? "!" + list : list;
39 }
40
41 bool has(const std::string& name) const {
42 if (all)
43 return true;
44 bool found = is_in_list(name, list);
45 return inverted ? !found : found;
46 }
47 };
48
49 struct FlagList {
50 std::string pattern;
51 bool has(char flag) const {
52 if (pattern.empty())
53 return true;
54 bool invert = (pattern[0] == '!');
55 bool found = (pattern.find(flag, invert ? 1 : 0) != std::string::npos);
56 return invert ? !found : found;
57 }
58 };
59
60 struct SequenceId {
61 int seqnum;
62 char icode;
63
64 bool empty() const {
65 return seqnum == INT_MIN || seqnum == INT_MAX;
66 }
67
68 std::string str() const {
69 std::string s;
70 if (!empty()) {
71 s = std::to_string(seqnum);
72 if (icode != '*') {
73 s += '.';
74 if (icode != ' ')
75 s += icode;
76 }
77 }
78 return s;
79 }
80
81 int compare(const SeqId& seqid) const {
82 if (seqnum != *seqid.num)
83 return seqnum < *seqid.num ? -1 : 1;
84 if (icode != '*' && icode != seqid.icode)
85 return icode < seqid.icode ? -1 : 1;
86 return 0;
87 }
88 };
89
93 double value;
94
95 bool matches(const Atom& a) const {
96 double atom_value = 0.;
97 if (property == 'q')
98 atom_value = a.occ;
99 else if (property == 'b')
100 atom_value = a.b_iso;
101 if (relation < 0)
102 return atom_value < value;
103 if (relation > 0)
104 return atom_value > value;
105 return atom_value == value;
106 }
107
108 std::string str() const {
109 std::string r = ";";
110 r += property;
111 r += relation == 0 ? '=' : relation < 0 ? '<' : '>';
112 r += to_str(value);
113 return r;
114 }
115 };
116
117 int mdl = 0; // 0 = all
123 // array corresponding to enum EntityType
124 std::array<char, 6> et_flags;
126 std::vector<char> elements;
130 std::vector<AtomInequality> atom_inequalities;
131
132 Selection() = default;
133 Selection(const std::string& cid);
134
135 std::string str() const {
136 std::string cid = "/";
137 if (mdl != 0)
138 cid += std::to_string(mdl);
139 cid += '/';
140 cid += chain_ids.str();
141 cid += '/';
142 cid += from_seqid.str();
143 if (!residue_names.all) {
144 cid += '(';
145 cid += residue_names.str();
146 cid += ')';
147 }
148 if ((!from_seqid.empty() || !to_seqid.empty()) &&
150 cid += '-';
151 cid += to_seqid.str();
152 }
153 cid += '/';
154 if (!atom_names.all)
155 cid += atom_names.str();
156 if (!elements.empty()) {
157 cid += '[';
158 bool inv = (std::count(elements.begin(), elements.end(), 1) > 64);
159 if (inv)
160 cid += '!';
161 for (size_t i = 0; i < elements.size(); ++i)
162 if (elements[i] != char(inv)) {
163 cid += element_name(static_cast<El>(i));
164 cid += ',';
165 }
166 cid.back() = ']';
167 }
168 if (!altlocs.all) {
169 cid += ':';
170 cid += altlocs.str();
171 }
172 if (!entity_types.all) {
173 cid += ';';
174 cid += entity_types.str();
175 }
176 for (const AtomInequality& ai : atom_inequalities)
177 cid += ai.str();
178 return cid;
179 }
180
181 bool matches(const Structure&) const { return true; }
182 bool matches(const Model& model) const {
183 return mdl == 0 || std::to_string(mdl) == model.name;
184 }
185 bool matches(const Chain& chain) const {
186 return chain_ids.has(chain.name);
187 }
188 bool matches(const Residue& res) const {
189 return (entity_types.all || et_flags[(int)res.entity_type]) &&
190 residue_names.has(res.name) &&
191 from_seqid.compare(res.seqid) <= 0 &&
192 to_seqid.compare(res.seqid) >= 0 &&
194 }
195 bool matches(const Atom& a) const {
196 return atom_names.has(a.name) &&
197 (elements.empty() || elements[a.element.ordinal()]) &&
198 (altlocs.all || altlocs.has(std::string(a.altloc ? 1 : 0, a.altloc))) &&
199 atom_flags.has(a.flag) &&
200 std::all_of(atom_inequalities.begin(), atom_inequalities.end(),
201 [&](const AtomInequality& i) { return i.matches(a); });
202 }
203 bool matches(const CRA& cra) const {
204 return (cra.chain == nullptr || matches(*cra.chain)) &&
205 (cra.residue == nullptr || matches(*cra.residue)) &&
206 (cra.atom == nullptr || matches(*cra.atom));
207 }
208
210 return {*this, st.models};
211 }
213 return {*this, model.chains};
214 }
216 return {*this, chain.residues};
217 }
219 return {*this, residue.atoms};
220 }
221
222 CRA first_in_model(Model& model) const {
223 if (matches(model))
224 for (Chain& chain : model.chains) {
225 if (matches(chain))
226 for (Residue& res : chain.residues) {
227 if (matches(res))
228 for (Atom& atom : res.atoms) {
229 if (matches(atom))
230 return {&chain, &res, &atom};
231 }
232 }
233 }
234 return {nullptr, nullptr, nullptr};
235 }
236
237 std::pair<Model*, CRA> first(Structure& st) const {
238 for (Model& model : st.models) {
239 CRA cra = first_in_model(model);
240 if (cra.chain)
241 return {&model, cra};
242 }
243 return {nullptr, {nullptr, nullptr, nullptr}};
244 }
245
246 template<typename T>
247 void add_matching_children(const T& orig, T& target) const {
248 for (const auto& orig_child : orig.children())
249 if (matches(orig_child)) {
250 target.children().push_back(orig_child.empty_copy());
251 add_matching_children(orig_child, target.children().back());
252 }
253 }
254 void add_matching_children(const Atom&, Atom&) const {}
255
256 Selection& set_residue_flags(const std::string& pattern) {
257 residue_flags.pattern = pattern;
258 return *this;
259 }
260 Selection& set_atom_flags(const std::string& pattern) {
261 atom_flags.pattern = pattern;
262 return *this;
263 }
264
265 template<typename T>
266 T copy_selection(const T& orig) const {
267 T copied = orig.empty_copy();
269 return copied;
270 }
271
272 template<typename T>
273 void remove_selected(T& t) const {
274 for (auto& child : t.children())
275 if (matches(child))
277 vector_remove_if(t.children(),
278 [&](typename T::child_type& c) { return c.children().empty(); });
279 }
280 void remove_selected(Residue& res) const {
281 if (atom_names.all && elements.empty() && altlocs.all &&
282 atom_flags.pattern.empty() && atom_inequalities.empty())
283 res.atoms.clear();
284 else
285 vector_remove_if(res.atoms, [&](Atom& c) { return matches(c); });
286 }
287
288 template<typename T>
289 void remove_not_selected(T& t) const {
290 vector_remove_if(t.children(), [&](typename T::child_type& c) { return !matches(c); });
291 for (auto& child : t.children())
293 }
294 void remove_not_selected(Atom&) const {}
295};
296
297namespace impl {
298
299[[noreturn]]
300inline GEMMI_COLD void wrong_syntax(const std::string& cid, size_t pos,
301 const char* info=nullptr) {
302 std::string msg = "Invalid selection syntax";
303 if (info)
304 msg += info;
305 if (pos != 0)
306 cat_to(msg, " near \"", cid.substr(pos, 8), '"');
307 cat_to(msg, ": ", cid);
308 fail(msg);
309}
310
311inline int determine_omitted_cid_fields(const std::string& cid) {
312 if (cid[0] == '/')
313 return 0; // model
314 if (std::isdigit(cid[0]) || cid[0] == '.' || cid[0] == '(' || cid[0] == '-')
315 return 2; // residue
316 size_t sep = cid.find_first_of("/([:;");
317 if (sep == std::string::npos || cid[sep] == '/' || cid[sep] == ';')
318 return 1; // chain
319 if (cid[sep] == '(')
320 return 2; // residue
321 return 3; // atom
322}
323
324inline Selection::List make_cid_list(const std::string& cid, size_t pos, size_t end,
325 const char* disallowed_chars="-[]()!/*.:;") {
326 Selection::List list;
327 list.all = (cid[pos] == '*');
328 list.inverted = (cid[pos] == '!');
329 if (list.all || list.inverted)
330 ++pos;
331 list.list = cid.substr(pos, end - pos);
332 // if a list have punctuation other than ',' something must be wrong
333 size_t idx = list.list.find_first_of(disallowed_chars);
334 if (idx != std::string::npos)
335 wrong_syntax(cid, pos + idx, cat(" ('", list.list[idx], "' in a list)").c_str());
336 return list;
337}
338
339inline void parse_cid_elements(const std::string& cid, size_t pos,
340 std::vector<char>& elements) {
341 elements.clear(); // just in case
342 if (cid[pos] == '*')
343 return;
344 bool inverted = false;
345 if (cid[pos] == '!') {
346 inverted = true;
347 ++pos;
348 }
349 elements.resize((size_t)El::END, char(inverted));
350 for (;;) {
351 size_t sep = cid.find_first_of(",]", pos);
352 if (sep == pos || sep > pos + 2)
353 wrong_syntax(cid, 0, "in [...]");
354 char elem_str[2] = {cid[pos], sep > pos+1 ? cid[pos+1] : '\0'};
355 Element el = find_element(elem_str);
356 if (el == El::X && (alpha_up(elem_str[0]) != 'X' || elem_str[1] != '\0'))
357 wrong_syntax(cid, 0, " (invalid element in [...])");
358 elements[el.ordinal()] = char(!inverted);
359 pos = sep + 1;
360 if (cid[sep] == ']')
361 break;
362 }
363}
364
365inline Selection::SequenceId parse_cid_seqid(const std::string& cid, size_t& pos,
366 int default_seqnum) {
367 size_t initial_pos = pos;
368 int seqnum = default_seqnum;
369 char icode = ' ';
370 if (cid[pos] == '*') {
371 ++pos;
372 icode = '*';
373 } else if (std::isdigit(cid[pos])) {
374 char* endptr;
375 seqnum = std::strtol(&cid[pos], &endptr, 10);
376 pos = endptr - &cid[0];
377 }
378 if (cid[pos] == '.')
379 ++pos;
380 if (initial_pos != pos && (std::isalpha(cid[pos]) || cid[pos] == '*'))
381 icode = cid[pos++];
382 return {seqnum, icode};
383}
384
385inline Selection::AtomInequality parse_atom_inequality(const std::string& cid,
386 size_t pos, size_t end) {
387 Selection::AtomInequality r;
388 if (cid[pos] != 'q' && cid[pos] != 'b')
389 wrong_syntax(cid, pos);
390 r.property = cid[pos];
391 ++pos;
392 while (cid[pos] == ' ')
393 ++pos;
394 if (cid[pos] == '<')
395 r.relation = -1;
396 else if (cid[pos] == '>')
397 r.relation = 1;
398 else if (cid[pos] == '=')
399 r.relation = 0;
400 else
401 wrong_syntax(cid, pos);
402 ++pos;
403 auto result = fast_from_chars(cid.c_str() + pos, r.value);
404 if (result.ec != std::errc())
405 wrong_syntax(cid, pos, " (expected number)");
406 pos = size_t(result.ptr - cid.c_str());
407 while (cid[pos] == ' ')
408 ++pos;
409 if (pos != end)
410 wrong_syntax(cid, pos);
411 return r;
412}
413
414inline bool has_inequality(const std::string& cid, size_t start, size_t end) {
415 for (size_t i = start; i < end; ++i)
416 if (cid[i] == '<' || cid[i] == '=' || cid[i] == '>')
417 return true;
418 return false;
419}
420
421inline void parse_cid(const std::string& cid, Selection& sel) {
422 if (cid.empty() || (cid.size() == 1 && cid[0] == '*'))
423 return;
424 int omit = determine_omitted_cid_fields(cid);
425 size_t sep = 0;
426 size_t semi = cid.find(';');
427 // model
428 if (omit == 0) {
429 sep = std::min(cid.find('/', 1), semi);
430 if (sep != 1 && cid[1] != '*') {
431 char* endptr;
432 sel.mdl = std::strtol(&cid[1], &endptr, 10);
433 size_t end_pos = endptr - &cid[0];
434 if (end_pos != sep && end_pos != cid.size())
435 wrong_syntax(cid, 0, " (at model number)");
436 }
437 }
438
439 // chain
440 if (omit <= 1 && sep < semi) {
441 size_t pos = (sep == 0 ? 0 : sep + 1);
442 sep = std::min(cid.find('/', pos), semi);
443 // These characters are not really disallowed, but are unexpected.
444 // "-" is expected, it's in chain IDs in bioassembly files from RCSB.
445 const char* disallowed_chars = "[]()!/*.:;";
446 sel.chain_ids = make_cid_list(cid, pos, sep, disallowed_chars);
447 }
448
449 // residue; MMDB CID syntax: s1.i1-s2.i2 or *(res).ic
450 // In gemmi both 14.a and 14a are accepted.
451 // *(ALA). and *(ALA) and (ALA). can be used instead of (ALA) for
452 // compatibility with MMDB.
453 if (omit <= 2 && sep < semi) {
454 size_t pos = (sep == 0 ? 0 : sep + 1);
455 if (cid[pos] != '(')
456 sel.from_seqid = parse_cid_seqid(cid, pos, INT_MIN);
457 if (cid[pos] == '(') {
458 ++pos;
459 size_t right_br = cid.find(')', pos);
460 sel.residue_names = make_cid_list(cid, pos, right_br);
461 pos = right_br + 1;
462 }
463 // allow "(RES)." and "(RES).*" and "(RES)*"
464 if (cid[pos] == '.')
465 ++pos;
466 if (cid[pos] == '*')
467 ++pos;
468 if (cid[pos] == '-') {
469 ++pos;
470 sel.to_seqid = parse_cid_seqid(cid, pos, INT_MAX);
471 } else if (sel.from_seqid.seqnum != INT_MIN) {
472 sel.to_seqid = sel.from_seqid;
473 }
474 sep = pos;
475 if (cid[sep] != '/' && cid[sep] != ';' && cid[sep] != '\0')
476 wrong_syntax(cid, 0);
477 }
478
479 // atom; at[el]:aloc
480 if (sep < std::min(cid.size(), semi)) {
481 size_t pos = (sep == 0 ? 0 : sep + 1);
482 size_t end = cid.find_first_of("[:;", pos);
483 if (end != pos) {
484 sel.atom_names = make_cid_list(cid, pos, end);
485 // Chain name can be empty, but not atom name,
486 // so we interpret empty atom name as *.
487 if (!sel.atom_names.inverted && sel.atom_names.list.empty())
488 sel.atom_names.all = true;
489 if (end == std::string::npos)
490 return;
491 }
492 if (cid[end] == '[') {
493 pos = end + 1;
494 end = cid.find(']', pos);
495 if (end == std::string::npos)
496 wrong_syntax(cid, 0, " (no matching ']')");
497 parse_cid_elements(cid, pos, sel.elements);
498 ++end;
499 }
500 if (cid[end] == ':') {
501 pos = end + 1;
502 sel.altlocs = make_cid_list(cid, pos, semi);
503 }
504 }
505
506 // extensions after semicolon(s)
507 while (semi < cid.size()) {
508 size_t pos = semi + 1;
509 while (cid[pos] == ' ')
510 ++pos;
511 semi = std::min(cid.find(';', pos), cid.size());
512 size_t end = semi;
513 while (end > pos && cid[end-1] == ' ')
514 --end;
515 if (has_inequality(cid, pos, end)) {
516 sel.atom_inequalities.push_back(parse_atom_inequality(cid, pos, end));
517 } else {
518 sel.entity_types = make_cid_list(cid, pos, end);
519 bool inv = sel.entity_types.inverted;
520 std::fill(sel.et_flags.begin(), sel.et_flags.end(), char(inv));
521 for (const std::string& item : split_str(sel.entity_types.list, ',')) {
523 if (item == "polymer")
525 else if (item == "solvent")
527 else
528 wrong_syntax(cid, 0, (" at " + item).c_str());
529 sel.et_flags[(int)et] = char(!inv);
530 }
531 }
532 }
533}
534
535} // namespace impl
536
537
538inline Selection::Selection(const std::string& cid) {
539 impl::parse_cid(cid, *this);
540}
541
542
543template<class T> size_t count_atom_sites(const T& obj, const Selection* sel=nullptr) {
544 size_t sum = 0;
545 if (!sel || sel->matches(obj))
546 for (const auto& child : obj.children())
547 sum += count_atom_sites(child, sel);
548 return sum;
549}
550template<> inline size_t count_atom_sites(const Atom& atom, const Selection* sel) {
551 return (!sel || sel->matches(atom)) ? 1 : 0;
552}
553
554template<class T> double count_occupancies(const T& obj, const Selection* sel=nullptr) {
555 double sum = 0;
556 if (!sel || sel->matches(obj))
557 for (const auto& child : obj.children())
558 sum += count_occupancies(child, sel);
559 return sum;
560}
561template<> inline double count_occupancies(const Atom& atom, const Selection* sel) {
562 return (!sel || sel->matches(atom)) ? atom.occ : 0;
563}
564
565} // namespace gemmi
566#endif
#define GEMMI_COLD
Definition fail.hpp:28
void vector_remove_if(std::vector< T > &v, F &&condition)
Definition util.hpp:266
El find_element(const char *symbol)
Definition elem.hpp:267
from_chars_result fast_from_chars(const char *start, const char *end, double &d)
Definition atof.hpp:16
double count_occupancies(const T &obj, const Selection *sel=nullptr)
Definition select.hpp:554
const char * element_name(El el)
Definition elem.hpp:195
std::string cat(Args const &... args)
Definition util.hpp:32
bool is_in_list(const std::string &name, const std::string &list, char sep=',')
Definition util.hpp:226
void cat_to(std::string &)
Definition util.hpp:25
GEMMI_DLL int GEMMI_DLL int std::string to_str(double d)
Definition sprintf.hpp:36
std::vector< std::string > split_str(const std::string &str, S sep)
Definition util.hpp:156
size_t count_atom_sites(const T &obj, const Selection *sel=nullptr)
Definition select.hpp:543
char alpha_up(char c)
Definition util.hpp:60
Represents atom site in macromolecular structure (~100 bytes).
Definition model.hpp:112
char altloc
Definition model.hpp:115
char flag
Definition model.hpp:119
float b_iso
Definition model.hpp:126
Element element
Definition model.hpp:117
float occ
Definition model.hpp:124
std::string name
Definition model.hpp:114
Residue * residue
Definition model.hpp:605
Chain * chain
Definition model.hpp:604
Atom * atom
Definition model.hpp:606
std::vector< Residue > residues
Definition model.hpp:465
std::string name
Definition model.hpp:464
int ordinal() const
Definition elem.hpp:301
std::vector< Chain > chains
Definition model.hpp:700
std::string name
Definition model.hpp:699
std::string name
Definition seqid.hpp:91
std::vector< Atom > atoms
Definition model.hpp:188
EntityType entity_type
Definition model.hpp:183
bool matches(const Atom &a) const
Definition select.hpp:95
std::string str() const
Definition select.hpp:108
bool has(char flag) const
Definition select.hpp:51
bool has(const std::string &name) const
Definition select.hpp:41
std::string str() const
Definition select.hpp:35
int compare(const SeqId &seqid) const
Definition select.hpp:81
std::string str() const
Definition select.hpp:68
Selection()=default
FlagList atom_flags
Definition select.hpp:129
Selection & set_atom_flags(const std::string &pattern)
Definition select.hpp:260
std::vector< AtomInequality > atom_inequalities
Definition select.hpp:130
FilterProxy< Selection, Atom > atoms(Residue &residue) const
Definition select.hpp:218
bool matches(const Residue &res) const
Definition select.hpp:188
T copy_selection(const T &orig) const
Definition select.hpp:266
bool matches(const Chain &chain) const
Definition select.hpp:185
bool matches(const Structure &) const
Definition select.hpp:181
bool matches(const Model &model) const
Definition select.hpp:182
CRA first_in_model(Model &model) const
Definition select.hpp:222
FilterProxy< Selection, Residue > residues(Chain &chain) const
Definition select.hpp:215
void remove_not_selected(T &t) const
Definition select.hpp:289
SequenceId from_seqid
Definition select.hpp:119
std::string str() const
Definition select.hpp:135
std::pair< Model *, CRA > first(Structure &st) const
Definition select.hpp:237
void add_matching_children(const T &orig, T &target) const
Definition select.hpp:247
FilterProxy< Selection, Model > models(Structure &st) const
Definition select.hpp:209
FlagList residue_flags
Definition select.hpp:128
void add_matching_children(const Atom &, Atom &) const
Definition select.hpp:254
Selection & set_residue_flags(const std::string &pattern)
Definition select.hpp:256
std::vector< char > elements
Definition select.hpp:126
void remove_not_selected(Atom &) const
Definition select.hpp:294
bool matches(const Atom &a) const
Definition select.hpp:195
FilterProxy< Selection, Chain > chains(Model &model) const
Definition select.hpp:212
void remove_selected(T &t) const
Definition select.hpp:273
bool matches(const CRA &cra) const
Definition select.hpp:203
std::array< char, 6 > et_flags
Definition select.hpp:124
void remove_selected(Residue &res) const
Definition select.hpp:280
SequenceId to_seqid
Definition select.hpp:120
OptionalNum num
Definition seqid.hpp:55
char icode
Definition seqid.hpp:56