Gemmi C++ API
Loading...
Searching...
No Matches
metadata.hpp
Go to the documentation of this file.
1// Copyright 2019 Global Phasing Ltd.
2//
3// Metadata from coordinate files.
4
5#ifndef GEMMI_METADATA_HPP_
6#define GEMMI_METADATA_HPP_
7
#include <algorithm>  // for any_of
#include <cmath>      // for NAN, isnan
#include <cstdint>    // for uint8_t, uint16_t
#include <string>
#include <utility>    // for move
#include <vector>
#include "math.hpp"      // for Mat33
#include "unitcell.hpp"  // for Position, Asu
#include "seqid.hpp"     // for SeqId
15
16namespace gemmi {
17
18// corresponds to the mmCIF _software category
29
30// Information from REMARK 200/230 is significantly expanded in PDBx/mmCIF.
31// These remarks correspond to data across 12 mmCIF categories
32// including categories _exptl, _reflns, _exptl_crystal, _diffrn and others.
33// _exptl and _reflns seem to be 1:1. Usually we have one experiment (_exptl),
34// except for a joint refinement (e.g. X-ray + neutron data).
35// Both crystal (_exptl_crystal) and reflection statistics (_reflns) can
36// be associated with multiple diffraction sets (_diffrn).
37// But if we use the PDB format, only one diffraction set per method
38// can be described.
39
41 double resolution_high = NAN; // _reflns.d_resolution_high
42 // (or _reflns_shell.d_res_high)
43 double resolution_low = NAN; // _reflns.d_resolution_low
44 double completeness = NAN; // _reflns.percent_possible_obs
45 double redundancy = NAN; // _reflns.pdbx_redundancy
46 double r_merge = NAN; // _reflns.pdbx_Rmerge_I_obs
47 double r_sym = NAN; // _reflns.pdbx_Rsym_value
48 double mean_I_over_sigma = NAN; // _reflns.pdbx_netI_over_sigmaI
49};
50
51// _exptl has no id, _exptl.method is key item and must be unique
53 std::string method; // _exptl.method
54 int number_of_crystals = -1; // _exptl.crystals_number
55 int unique_reflections = -1; // _reflns.number_obs
57 double b_wilson = NAN; // _reflns.B_iso_Wilson_estimate
58 std::vector<ReflectionsInfo> shells;
59 std::vector<std::string> diffraction_ids;
60};
61
63 std::string id; // _diffrn.id
64 double temperature = NAN; // _diffrn.ambient_temp
65 std::string source; // _diffrn_source.source
66 std::string source_type; // _diffrn_source.type
67 std::string synchrotron; // _diffrn_source.pdbx_synchrotron_site
68 std::string beamline; // _diffrn_source.pdbx_synchrotron_beamline
69 std::string wavelengths; // _diffrn_source.pdbx_wavelength
70 std::string scattering_type; // _diffrn_radiation.pdbx_scattering_type
71 char mono_or_laue = '\0'; // _diffrn_radiation.pdbx_monochromatic_or_laue_m_l
72 std::string monochromator; // _diffrn_radiation.monochromator
73 std::string collection_date; // _diffrn_detector.pdbx_collection_date
74 std::string optics; // _diffrn_detector.details
75 std::string detector; // _diffrn_detector.detector
76 std::string detector_make; // _diffrn_detector.type
77};
78
80 std::string id; // _exptl_crystal.id
81 std::string description; // _exptl_crystal.description
82 double ph = NAN; // _exptl_crystal_grow.pH
83 std::string ph_range; // _exptl_crystal_grow.pdbx_pH_range
84 std::vector<DiffractionInfo> diffractions;
85};
86
87
88struct TlsGroup {
89 struct Selection {
90 std::string chain;
93 std::string details; // _pdbx_refine_tls_group.selection_details
94 };
95 std::string id; // _pdbx_refine_tls.id
96 std::vector<Selection> selections;
97 Position origin; // _pdbx_refine_tls.origin_x/y/z
98 Mat33 T; // _pdbx_refine_tls.T[][]
99 Mat33 L; // _pdbx_refine_tls.L[][]
100 Mat33 S; // _pdbx_refine_tls.S[][]
101};
102
103// RefinementInfo corresponds to REMARK 3.
104// BasicRefinementInfo is used for both total and per-bin statistics.
105// For per-bin data, each value corresponds to one _refine_ls_shell.* tag.
107 double resolution_high = NAN; // _refine.ls_d_res_high
108 double resolution_low = NAN; // _refine.ls_d_res_low
109 double completeness = NAN; // _refine.ls_percent_reflns_obs
110 int reflection_count = -1; // _refine.ls_number_reflns_obs
111 int rfree_set_count = -1; // _refine.ls_number_reflns_R_free
112 double r_all = NAN; // _refine.ls_R_factor_obs
113 double r_work = NAN; // _refine.ls_R_factor_R_work
114 double r_free = NAN; // _refine.ls_R_factor_R_free
115};
116
118 struct Restr {
119 std::string name;
120 int count = -1;
121 double weight = NAN;
122 std::string function;
123 double dev_ideal = NAN;
124 Restr(const std::string& name_) : name(name_) {}
125 };
126 std::string id;
127 std::string cross_validation_method; // _refine.pdbx_ls_cross_valid_method
128 std::string rfree_selection_method; // _refine.pdbx_R_Free_selection_details
129 int bin_count = -1; // _refine_ls_shell.pdbx_total_number_of_bins_used
130 std::vector<BasicRefinementInfo> bins;
131 double mean_b = NAN; // _refine.B_iso_mean
132 Mat33 aniso_b{NAN}; // _refine.aniso_B[][]
133 double luzzati_error = NAN; // _refine_analyze.Luzzati_coordinate_error_obs
134 double dpi_blow_r = NAN; // _refine.pdbx_overall_SU_R_Blow_DPI
135 double dpi_blow_rfree = NAN; // _refine.pdbx_overall_SU_R_free_Blow_DPI
136 double dpi_cruickshank_r = NAN; // _refine.overall_SU_R_Cruickshank_DPI
137 double dpi_cruickshank_rfree = NAN; // _refine.pdbx_overall_SU_R_free_Cruickshank_DPI
138 double cc_fo_fc = NAN; // _refine.correlation_coeff_Fo_to_Fc
139 double cc_fo_fc_free = NAN; // _refine.correlation_coeff_Fo_to_Fc_free
140 std::vector<Restr> restr_stats; // _refine_ls_restr
141 std::vector<TlsGroup> tls_groups; // _pdbx_refine_tls
142 std::string remarks;
143};
144
145
146struct Metadata {
147 std::vector<std::string> authors; // _audit_author.name
148 std::vector<ExperimentInfo> experiments;
149 std::vector<CrystalInfo> crystals;
150 std::vector<RefinementInfo> refinement;
151 std::vector<SoftwareItem> software;
152 std::string solved_by; // _refine.pdbx_method_to_determine_struct
153 std::string starting_model; // _refine.pdbx_starting_model
154 std::string remark_300_detail; // _struct_biol.details
155
156 bool has(double RefinementInfo::*field) const {
157 return std::any_of(refinement.begin(), refinement.end(),
158 [&](const RefinementInfo& r) { return !std::isnan(r.*field); });
159 }
161 return std::any_of(refinement.begin(), refinement.end(),
162 [&](const RefinementInfo& r) { return r.*field != -1; });
163 }
164 bool has(std::string RefinementInfo::*field) const {
165 return std::any_of(refinement.begin(), refinement.end(),
166 [&](const RefinementInfo& r) { return !(r.*field).empty(); });
167 }
169 return std::any_of(refinement.begin(), refinement.end(),
170 [&](const RefinementInfo& r) { return !std::isnan((r.*field)[0][0]); });
171 }
172 bool has_restr() const {
173 return std::any_of(refinement.begin(), refinement.end(),
174 [&](const RefinementInfo& r) { return !r.restr_stats.empty(); });
175 }
176 bool has_tls() const {
177 return std::any_of(refinement.begin(), refinement.end(),
178 [&](const RefinementInfo& r) { return !r.tls_groups.empty(); });
179 }
180};
181
182
183// Entity description.
184//
185// values corresponding to mmCIF _entity.type
186enum class EntityType : unsigned char {
187 Unknown,
188 Polymer,
190 Branched, // introduced in 2020
191 // _entity.type macrolide is in PDBx/mmCIF, but no PDB entry uses it
192 //Macrolide,
193 Water
194};
195
196// values corresponding to mmCIF _entity_poly.type
enum class PolymerType : unsigned char {
  Unknown,       // unknown or not applicable
  PeptideL,      // polypeptide(L) in mmCIF (168923 values in the PDB in 2017)
  PeptideD,      // polypeptide(D) (57 values)
  Dna,           // polydeoxyribonucleotide (9905)
  Rna,           // polyribonucleotide (4559)
  DnaRnaHybrid,  // polydeoxyribonucleotide/polyribonucleotide hybrid (156)
  SaccharideD,   // polysaccharide(D) (18)
  SaccharideL,   // polysaccharide(L) (0)
  Pna,           // peptide nucleic acid (2)
  CyclicPseudoPeptide,  // cyclic-pseudo-peptide (1)
  Other,         // other (4)
};

// Returns true for the two polypeptide kinds, PeptideL and PeptideD.
// Declared inline because it is defined in a header.
inline bool is_polypeptide(PolymerType pt) {
  return pt == PolymerType::PeptideL || pt == PolymerType::PeptideD;
}

// Returns true for Dna, Rna and DnaRnaHybrid.
// Declared inline because it is defined in a header.
inline bool is_polynucleotide(PolymerType pt) {
  return pt == PolymerType::Dna || pt == PolymerType::Rna ||
         pt == PolymerType::DnaRnaHybrid;
}
219
220struct Entity {
221 struct DbRef {
222 std::string db_name;
223 std::string accession_code;
224 std::string id_code;
225 std::string isoform; // pdbx_db_isoform
226 SeqId seq_begin, seq_end;
229 };
230 std::string name;
231 std::vector<std::string> subchains;
232 EntityType entity_type = EntityType::Unknown;
233 PolymerType polymer_type = PolymerType::Unknown;
234 std::vector<DbRef> dbrefs;
236 std::vector<std::string> sifts_unp_acc;
238 std::vector<std::string> full_sequence;
239
240 explicit Entity(std::string name_) noexcept : name(name_) {}
241 static std::string first_mon(const std::string& mon_list) {
242 return mon_list.substr(0, mon_list.find(','));
243 }
244};
245
249 char res = '\0'; // _pdbx_sifts_xref_db.unp_res
250 std::uint8_t acc_index = 0; // index of Entity::sifts_unp_acc
251 std::uint16_t num = 0; // _pdbx_sifts_xref_db.unp_num
252};
253
254// A connection. Corresponds to _struct_conn.
255// Symmetry operators are not trusted and not stored.
256// We assume that the nearest symmetry mate is connected.
258 // in write_struct_conn() we assume that Unknown is at the end
259 enum Type { Covale=0, Disulf, Hydrog, MetalC, Unknown };
260 std::string name;
261 std::string link_id; // _struct_conn.ccp4_link_id (== _chem_link.id)
262 Type type = Unknown;
263 Asu asu = Asu::Any;
265 double reported_distance = 0.0;
266};
267
268// Corresponds to CISPEP or _struct_mon_prot_cis
269struct CisPep {
271 std::string model_str;
272 // mmCIF has (unused by the PDB) tag _struct_mon_prot_cis.label_alt_id
273 // that enables defining CIS link per conformation.
274 char only_altloc = '\0';
275 double reported_angle = NAN;
276};
277
278struct ModRes {
279 std::string chain_name;
281 std::string parent_comp_id;
282 std::string mod_id; // non-standard extension used in Refmac
283 std::string details;
284};
285
286// Secondary structure. PDBx/mmCIF stores helices and sheets separately.
287
288// mmCIF spec defines 32 possible values for _struct_conf.conf_type_id -
289// "the type of the conformation of the backbone of the polymer (whether
290// protein or nucleic acid)". But as of 2019 only HELX_P is used (not counting
291// TURN_P that occurs in only 6 entries). The actual helix type is given
292// by numeric value of _struct_conf.pdbx_PDB_helix_class, which corresponds
293// to helixClass from the PDB HELIX record. These values are in the range 1-10.
294// As of 2019 it's almost only type 1 and 5:
295// 3116566 of 1 - right-handed alpha
296// 16 of 2 - right-handed omega
297// 84 of 3 - right-handed pi
298// 79 of 4 - right-handed gamma
299// 1063337 of 5 - right-handed 3-10
300// 27 of 6 - left-handed alpha
301// 5 of 7 - left-handed omega
302// 2 of 8 - left-handed gamma
303// 8 of 9 - 2-7 ribbon/helix
304// 46 of 10 - polyproline
305struct Helix {
307 UnknownHelix, RAlpha, ROmega, RPi, RGamma, R310,
308 LAlpha, LOmega, LGamma, Helix27, HelixPolyProlineNone
309 };
311 HelixClass pdb_helix_class = UnknownHelix;
312 int length = -1;
314 if (n >= 1 && n <= 10)
315 pdb_helix_class = static_cast<HelixClass>(n);
316 }
317};
318
319struct Sheet {
320 struct Strand {
323 int sense; // 0 = first strand, 1 = parallel, -1 = anti-parallel.
324 std::string name; // optional, _struct_sheet_range.id if from mmCIF
325 };
326 std::string name;
327 std::vector<Strand> strands;
328 explicit Sheet(std::string sheet_id) noexcept : name(sheet_id) {}
329};
330
331
332// bioassembly / biomolecule
333struct Assembly {
334 struct Operator {
335 std::string name; // optional
336 std::string type; // optional (from mmCIF only)
338 };
339 struct Gen {
340 std::vector<std::string> chains;
341 std::vector<std::string> subchains;
342 std::vector<Operator> operators;
343 };
344 enum class SpecialKind {
345 NA, CompleteIcosahedral, RepresentativeHelical, CompletePoint
346 };
347 std::string name;
348 bool author_determined = false;
349 bool software_determined = false;
350 SpecialKind special_kind = SpecialKind::NA;
351 int oligomeric_count = 0;
353 std::string software_name;
354 double absa = NAN; // TOTAL BURIED SURFACE AREA: ... ANGSTROM**2
355 double ssa = NAN; // SURFACE AREA OF THE COMPLEX: ... ANGSTROM**2
356 double more = NAN; // CHANGE IN SOLVENT FREE ENERGY: ... KCAL/MOL
357 std::vector<Gen> generators;
358 Assembly(const std::string& name_) : name(name_) {}
359};
360
361} // namespace gemmi
362#endif
bool is_polypeptide(PolymerType pt)
Definition metadata.hpp:211
bool is_polynucleotide(PolymerType pt)
Definition metadata.hpp:215
Definition seqid.hpp:149
std::vector< std::string > subchains
Definition metadata.hpp:341
std::vector< std::string > chains
Definition metadata.hpp:340
std::vector< Operator > operators
Definition metadata.hpp:342
std::string name
Definition metadata.hpp:347
std::string oligomeric_details
Definition metadata.hpp:352
std::vector< Gen > generators
Definition metadata.hpp:357
std::string software_name
Definition metadata.hpp:353
Assembly(const std::string &name_)
Definition metadata.hpp:358
AtomAddress partner_c
Definition metadata.hpp:270
std::string model_str
Definition metadata.hpp:271
AtomAddress partner1
Definition metadata.hpp:264
std::string name
Definition metadata.hpp:260
std::string link_id
Definition metadata.hpp:261
std::vector< DiffractionInfo > diffractions
Definition metadata.hpp:84
std::string ph_range
Definition metadata.hpp:83
std::string id
Definition metadata.hpp:80
std::string description
Definition metadata.hpp:81
std::string monochromator
Definition metadata.hpp:72
std::string synchrotron
Definition metadata.hpp:67
std::string wavelengths
Definition metadata.hpp:69
std::string collection_date
Definition metadata.hpp:73
std::string scattering_type
Definition metadata.hpp:70
std::string detector_make
Definition metadata.hpp:76
std::string source_type
Definition metadata.hpp:66
std::string db_name
Definition metadata.hpp:222
std::string id_code
Definition metadata.hpp:224
std::string isoform
Definition metadata.hpp:225
std::string accession_code
Definition metadata.hpp:223
SeqId::OptionalNum label_seq_begin
Definition metadata.hpp:228
Entity(std::string name_) noexcept
Definition metadata.hpp:240
std::vector< std::string > sifts_unp_acc
List of SIFTS Uniprot ACs referenced by SiftsUnpResidue::acc_index.
Definition metadata.hpp:236
std::vector< std::string > subchains
Definition metadata.hpp:231
std::string name
Definition metadata.hpp:230
std::vector< std::string > full_sequence
SEQRES or entity_poly_seq with microheterogeneity as comma-separated names.
Definition metadata.hpp:238
static std::string first_mon(const std::string &mon_list)
Definition metadata.hpp:241
std::vector< DbRef > dbrefs
Definition metadata.hpp:234
std::vector< std::string > diffraction_ids
Definition metadata.hpp:59
std::vector< ReflectionsInfo > shells
Definition metadata.hpp:58
ReflectionsInfo reflections
Definition metadata.hpp:56
void set_helix_class_as_int(int n)
Definition metadata.hpp:313
AtomAddress end
Definition metadata.hpp:310
std::vector< SoftwareItem > software
Definition metadata.hpp:151
std::vector< CrystalInfo > crystals
Definition metadata.hpp:149
bool has(std::string RefinementInfo::*field) const
Definition metadata.hpp:164
std::vector< ExperimentInfo > experiments
Definition metadata.hpp:148
bool has(int RefinementInfo::*field) const
Definition metadata.hpp:160
std::string starting_model
Definition metadata.hpp:153
std::string remark_300_detail
Definition metadata.hpp:154
bool has_restr() const
Definition metadata.hpp:172
bool has(double RefinementInfo::*field) const
Definition metadata.hpp:156
bool has(Mat33 RefinementInfo::*field) const
Definition metadata.hpp:168
bool has_tls() const
Definition metadata.hpp:176
std::vector< RefinementInfo > refinement
Definition metadata.hpp:150
std::string solved_by
Definition metadata.hpp:152
std::vector< std::string > authors
Definition metadata.hpp:147
std::string parent_comp_id
Definition metadata.hpp:281
std::string details
Definition metadata.hpp:283
std::string chain_name
Definition metadata.hpp:279
std::string mod_id
Definition metadata.hpp:282
ResidueId res_id
Definition metadata.hpp:280
Coordinates in Angstroms - orthogonal (Cartesian) coordinates.
Definition unitcell.hpp:32
Restr(const std::string &name_)
Definition metadata.hpp:124
std::string rfree_selection_method
Definition metadata.hpp:128
std::string cross_validation_method
Definition metadata.hpp:127
std::vector< BasicRefinementInfo > bins
Definition metadata.hpp:130
std::vector< TlsGroup > tls_groups
Definition metadata.hpp:141
std::vector< Restr > restr_stats
Definition metadata.hpp:140
AtomAddress hbond_atom1
Definition metadata.hpp:322
std::vector< Strand > strands
Definition metadata.hpp:327
Sheet(std::string sheet_id) noexcept
Definition metadata.hpp:328
std::string name
Definition metadata.hpp:326
Reference to UniProt residue, based on _pdbx_sifts_xref_db.
Definition metadata.hpp:248
Classification classification
Definition metadata.hpp:27
std::string date
Definition metadata.hpp:26
std::string version
Definition metadata.hpp:25
std::string name
Definition metadata.hpp:24
std::string id
Definition metadata.hpp:95
std::vector< Selection > selections
Definition metadata.hpp:96
Position origin
Definition metadata.hpp:97