Gemmi C++ API
Loading...
Searching...
No Matches
metadata.hpp
Go to the documentation of this file.
1// Copyright 2019 Global Phasing Ltd.
2//
3// Metadata from coordinate files.
4
5#ifndef GEMMI_METADATA_HPP_
6#define GEMMI_METADATA_HPP_
7
8#include <cstdint> // for uint8_t, uint16_t
9#include <algorithm> // for any_of
10#include <string>
11#include <vector>
12#include "math.hpp" // for Mat33
13#include "unitcell.hpp" // for Position, Asu
14#include "seqid.hpp" // for SeqId
15
16namespace gemmi {
17
18// corresponds to the mmCIF _software category
32
33// Information from REMARK 200/230 is significantly expanded in PDBx/mmCIF.
34// These remarks correspond to data across 12 mmCIF categories
35// including categories _exptl, _reflns, _exptl_crystal, _diffrn and others.
36// _exptl and _reflns seem to be 1:1. Usually we have one experiment (_exptl),
37// except for a joint refinement (e.g. X-ray + neutron data).
38// Both crystal (_exptl_crystal) and reflection statistics (_reflns) can
39// be associated with multiple diffraction sets (_diffrn).
40// But if we use the PDB format, only one diffraction set per method
41// can be described.
42
44 double resolution_high = NAN; // _reflns.d_resolution_high
45 // (or _reflns_shell.d_res_high)
46 double resolution_low = NAN; // _reflns.d_resolution_low
47 double completeness = NAN; // _reflns.percent_possible_obs
48 double redundancy = NAN; // _reflns.pdbx_redundancy
49 double r_merge = NAN; // _reflns.pdbx_Rmerge_I_obs
50 double r_sym = NAN; // _reflns.pdbx_Rsym_value
51 double mean_I_over_sigma = NAN; // _reflns.pdbx_netI_over_sigmaI
52};
53
54// _exptl has no id; _exptl.method is the key item and must be unique
56 std::string method; // _exptl.method
57 int number_of_crystals = -1; // _exptl.crystals_number
58 int unique_reflections = -1; // _reflns.number_obs
60 double b_wilson = NAN; // _reflns.B_iso_Wilson_estimate
61 std::vector<ReflectionsInfo> shells;
62 std::vector<std::string> diffraction_ids;
63};
64
66 std::string id; // _diffrn.id
67 double temperature = NAN; // _diffrn.ambient_temp
68 std::string source; // _diffrn_source.source
69 std::string source_type; // _diffrn_source.type
70 std::string synchrotron; // _diffrn_source.pdbx_synchrotron_site
71 std::string beamline; // _diffrn_source.pdbx_synchrotron_beamline
72 std::string wavelengths; // _diffrn_source.pdbx_wavelength
73 std::string scattering_type; // _diffrn_radiation.pdbx_scattering_type
74 char mono_or_laue = '\0'; // _diffrn_radiation.pdbx_monochromatic_or_laue_m_l
75 std::string monochromator; // _diffrn_radiation.monochromator
76 std::string collection_date; // _diffrn_detector.pdbx_collection_date
77 std::string optics; // _diffrn_detector.details
78 std::string detector; // _diffrn_detector.detector
79 std::string detector_make; // _diffrn_detector.type
80};
81
83 std::string id; // _exptl_crystal.id
84 std::string description; // _exptl_crystal.description
85 double ph = NAN; // _exptl_crystal_grow.pH
86 std::string ph_range; // _exptl_crystal_grow.pdbx_pH_range
87 std::vector<DiffractionInfo> diffractions;
88};
89
90
91struct TlsGroup {
92 struct Selection {
93 std::string chain;
96 std::string details; // _pdbx_refine_tls_group.selection_details
97 };
98 short num_id = -1; // id stored as number (optimization)
99 std::string id; // _pdbx_refine_tls.id
100 std::vector<Selection> selections;
101 Position origin; // _pdbx_refine_tls.origin_x/y/z
102 SMat33<double> T = {NAN, NAN, NAN, NAN, NAN, NAN}; // _pdbx_refine_tls.T[][]
103 SMat33<double> L = {NAN, NAN, NAN, NAN, NAN, NAN}; // _pdbx_refine_tls.L[][]
104 Mat33 S = Mat33{NAN}; // _pdbx_refine_tls.S[][]
105};
106
107// RefinementInfo corresponds to REMARK 3.
108// BasicRefinementInfo is used for both total and per-bin statistics.
109// For per-bin data, each value corresponds to one _refine_ls_shell.* tag.
111 double resolution_high = NAN; // _refine.ls_d_res_high, _refine_ls_shell.d_res_high
112 double resolution_low = NAN; // _refine.ls_d_res_low, _refine_ls_shell.d_res_low
113 double completeness = NAN; // _refine.ls_percent_reflns_obs, _refine_ls_shell.percent...
114 int reflection_count = -1; // _refine.ls_number_reflns_obs, _refine_ls_shell.number...
115 int work_set_count = -1; // _refine.ls_number_reflns_R_work, _refine_ls_shell.number...
116 int rfree_set_count = -1; // _refine.ls_number_reflns_R_free, _refine_ls_shell.number...
117 double r_all = NAN; // _refine.ls_R_factor_obs, _refine_ls_shell.R_factor_obs
118 double r_work = NAN; // _refine.ls_R_factor_R_work, _refine_ls_shell.R_factor_R_work
119 double r_free = NAN; // _refine.ls_R_factor_R_free, _refine_ls_shell.R_factor_R_free
120 double cc_fo_fc_work = NAN; // _refine.correlation_coeff_Fo_to_Fc, _refine_ls_shell.corr...
121 double cc_fo_fc_free = NAN; // _refine.correlation_coeff_Fo_to_Fc_free, _refine_ls_shell.c...
122 double fsc_work = NAN; // _refine.pdbx_average_fsc_work, _refine_ls_shell.pdbx_fsc_work
123 double fsc_free = NAN; // _refine.pdbx_average_fsc_free, _refine_ls_shell.pdbx_fsc_free
124 double cc_intensity_work = NAN; // _refine.correlation_coeff_I_to_Fcsqd_work, ...
125 double cc_intensity_free = NAN; // _refine.correlation_coeff_I_to_Fcsqd_free, ...
126};
127
// Statistics for one kind of restraint. The enclosing struct keeps a list of
// these in its restr_stats member (annotated there as _refine_ls_restr).
129 struct Restr {
130 std::string name;
131 int count = -1; // NOTE(review): -1 / NAN defaults presumably mean "not set" - confirm
132 double weight = NAN;
133 std::string function;
134 double dev_ideal = NAN;
135
136 Restr() = default;
// Sets only `name`; the remaining members keep the defaults given above.
// `explicit` prevents implicit conversion from std::string to Restr.
137 explicit Restr(const std::string& name_) : name(name_) {}
138 };
139 std::string id;
140 std::string cross_validation_method; // _refine.pdbx_ls_cross_valid_method
141 std::string rfree_selection_method; // _refine.pdbx_R_Free_selection_details
142 int bin_count = -1; // _refine_ls_shell.pdbx_total_number_of_bins_used
143 std::vector<BasicRefinementInfo> bins;
144 double mean_b = NAN; // _refine.B_iso_mean
145 SMat33<double> aniso_b{NAN, NAN, NAN, NAN, NAN, NAN}; // _refine.aniso_B[][]
146 double luzzati_error = NAN; // _refine_analyze.Luzzati_coordinate_error_obs
147 double dpi_blow_r = NAN; // _refine.pdbx_overall_SU_R_Blow_DPI
148 double dpi_blow_rfree = NAN; // _refine.pdbx_overall_SU_R_free_Blow_DPI
149 double dpi_cruickshank_r = NAN; // _refine.overall_SU_R_Cruickshank_DPI
150 double dpi_cruickshank_rfree = NAN; // _refine.pdbx_overall_SU_R_free_Cruickshank_DPI
151 std::vector<Restr> restr_stats; // _refine_ls_restr
152 std::vector<TlsGroup> tls_groups; // _pdbx_refine_tls
153 std::string remarks;
154};
155
156
157struct Metadata {
158 std::vector<std::string> authors; // _audit_author.name
159 std::vector<ExperimentInfo> experiments;
160 std::vector<CrystalInfo> crystals;
161 std::vector<RefinementInfo> refinement;
162 std::vector<SoftwareItem> software;
163 std::string solved_by; // _refine.pdbx_method_to_determine_struct
164 std::string starting_model; // _refine.pdbx_starting_model
165 std::string remark_300_detail; // _struct_biol.details
166
// Returns true if, for at least one element of `refinement`, the double
// member selected by `field` is not NaN. Returns false when `refinement`
// is empty.
167 bool has(double RefinementInfo::*field) const {
168 return std::any_of(refinement.begin(), refinement.end(),
169 [&](const RefinementInfo& r) { return !std::isnan(r.*field); });
170 }
172 return std::any_of(refinement.begin(), refinement.end(),
173 [&](const RefinementInfo& r) { return r.*field != -1; });
174 }
// Returns true if, for at least one element of `refinement`, the string
// member selected by `field` is non-empty. Returns false when `refinement`
// is empty.
175 bool has(std::string RefinementInfo::*field) const {
176 return std::any_of(refinement.begin(), refinement.end(),
177 [&](const RefinementInfo& r) { return !(r.*field).empty(); });
178 }
180 return std::any_of(refinement.begin(), refinement.end(),
181 [&](const RefinementInfo& r) { return !std::isnan((r.*field).u11); });
182 }
// Returns true if at least one element of `refinement` has a non-empty
// restr_stats list.
183 bool has_restr() const {
184 return std::any_of(refinement.begin(), refinement.end(),
185 [&](const RefinementInfo& r) { return !r.restr_stats.empty(); });
186 }
187
188 // TLS constraints are not specific to refinement in joint refinement,
189 // so they are expected to be present only in a single RefinementInfo.
190 // As of 2025, two PDB entries have TLS + joint refinement: 6N3U and 5NKU.
// Returns a pointer to the tls_groups vector of the first element of
// `refinement` whose tls_groups is non-empty, or nullptr if there is none.
// The pointer refers to storage inside `refinement`.
191 std::vector<gemmi::TlsGroup>* get_tls_groups() {
192 for (gemmi::RefinementInfo& ref : refinement)
193 if (!ref.tls_groups.empty())
194 return &ref.tls_groups;
195 return nullptr;
196 }
// Const overload. It forwards to the non-const get_tls_groups() through a
// const_cast to avoid duplicating the search loop; the result is returned
// as pointer-to-const. The non-const overload only reads `refinement` and
// returns an address, so nothing is modified through the cast here.
197 const std::vector<gemmi::TlsGroup>* get_tls_groups() const {
198 return const_cast<Metadata*>(this)->get_tls_groups();
199 }
200};
201
202
203// Entity description.
204//
205// values corresponding to mmCIF _entity.type
206enum class EntityType : unsigned char {
207 Unknown,
208 Polymer,
210 Branched, // introduced in 2020
211 // _entity.type macrolide is in PDBx/mmCIF, but no PDB entry uses it
212 //Macrolide,
213 Water
214};
215
216// values corresponding to mmCIF _entity_poly.type
// Each trailing comment gives the corresponding mmCIF value (stated
// explicitly for PeptideL). The parenthesized numbers are presumably
// occurrence counts in the PDB in 2017, as written out for PeptideL -
// TODO confirm for the other entries.
// The underlying type is unsigned char; enumerators take consecutive values
// starting from Unknown = 0.
217enum class PolymerType : unsigned char {
218 Unknown, // unknown or not applicable
219 PeptideL, // polypeptide(L) in mmCIF (168923 values in the PDB in 2017)
220 PeptideD, // polypeptide(D) (57 values)
221 Dna, // polydeoxyribonucleotide (9905)
222 Rna, // polyribonucleotide (4559)
223 DnaRnaHybrid, // polydeoxyribonucleotide/polyribonucleotide hybrid (156)
224 SaccharideD, // polysaccharide(D) (18)
225 SaccharideL, // polysaccharide(L) (0)
226 Pna, // peptide nucleic acid (2)
227 CyclicPseudoPeptide, // cyclic-pseudo-peptide (1)
228 Other, // other (4)
229};
230
232 return pt == PolymerType::PeptideL || pt == PolymerType::PeptideD;
233}
234
236 return pt == PolymerType::Dna || pt == PolymerType::Rna ||
237 pt == PolymerType::DnaRnaHybrid;
238}
239
240struct Entity {
241 struct DbRef {
242 std::string db_name;
243 std::string accession_code;
244 std::string id_code;
245 std::string isoform; // pdbx_db_isoform
246 SeqId seq_begin, seq_end;
249 };
250 std::string name;
251 std::vector<std::string> subchains;
252 EntityType entity_type = EntityType::Unknown;
253 PolymerType polymer_type = PolymerType::Unknown;
254 // In case of microheterogeneity, PDB SEQRES has only the first residue name.
255 bool reflects_microhetero = false;
256 std::vector<DbRef> dbrefs;
258 std::vector<std::string> sifts_unp_acc;
260 std::vector<std::string> full_sequence;
261
262 Entity() = default;
// Sets only `name`; all other members keep their in-class defaults.
// NOTE(review): this constructor is declared noexcept, yet name(name_)
// copies a std::string, which can throw std::bad_alloc; an exception
// escaping a noexcept function calls std::terminate. The string-taking
// constructors of Restr and Assembly in this file are not noexcept -
// confirm whether noexcept is intended here.
263 explicit Entity(const std::string& name_) noexcept : name(name_) {}
// Returns the part of `mon_list` that precedes the first comma. If there is
// no comma, find() yields npos and substr(0, npos) returns the whole string.
// NOTE(review): presumably intended for full_sequence items, where
// microheterogeneity is written as comma-separated names - confirm.
264 static std::string first_mon(const std::string& mon_list) {
265 return mon_list.substr(0, mon_list.find(','));
266 }
267};
268
272 char res = '\0'; // _pdbx_sifts_xref_db.unp_res
273 std::uint8_t acc_index = 0; // index of Entity::sifts_unp_acc
274 std::uint16_t num = 0; // _pdbx_sifts_xref_db.unp_num
275};
276
277// A connection. Corresponds to _struct_conn.
278// Symmetry operators are not trusted and not stored.
279// We assume that the nearest symmetry mate is connected.
281 // in write_struct_conn() we assume that Unknown is at the end
282 enum Type : unsigned char { Covale=0, Disulf, Hydrog, MetalC, Unknown };
283 std::string name;
284 std::string link_id; // _struct_conn.ccp4_link_id (== _chem_link.id)
285 Type type = Unknown;
286 Asu asu = Asu::Any;
288 double reported_distance = 0.0;
289 short reported_sym[4] = {}; // don't rely on it, for internal use only
290};
291
292// Corresponds to CISPEP or _struct_mon_prot_cis
293struct CisPep {
295 int model_num = 0;
296 // mmCIF has (unused by the PDB) tag _struct_mon_prot_cis.label_alt_id
297 // that enables defining CIS link per conformation.
298 char only_altloc = '\0';
299 double reported_angle = NAN;
300};
301
302struct ModRes {
303 std::string chain_name;
305 std::string parent_comp_id;
306 std::string mod_id; // non-standard extension used in Refmac
307 std::string details;
308};
309
310// Secondary structure. PDBx/mmCIF stores helices and sheets separately.
311
312// mmCIF spec defines 32 possible values for _struct_conf.conf_type_id -
313// "the type of the conformation of the backbone of the polymer (whether
314// protein or nucleic acid)". But as of 2019 only HELX_P is used (not counting
315// TURN_P that occurs in only 6 entries). The actual helix type is given
316// by numeric value of _struct_conf.pdbx_PDB_helix_class, which corresponds
317// to helixClass from the PDB HELIX record. These values are in the range 1-10.
318// As of 2019 it's almost only type 1 and 5:
319// 3116566 of 1 - right-handed alpha
320// 16 of 2 - right-handed omega
321// 84 of 3 - right-handed pi
322// 79 of 4 - right-handed gamma
323// 1063337 of 5 - right-handed 3-10
324// 27 of 6 - left-handed alpha
325// 5 of 7 - left-handed omega
326// 2 of 8 - left-handed gamma
327// 8 of 9 - 2-7 ribbon/helix
328// 46 of 10 - polyproline
329struct Helix {
331 UnknownHelix, RAlpha, ROmega, RPi, RGamma, R310,
332 LAlpha, LOmega, LGamma, Helix27, HelixPolyProlineNone
333 };
335 HelixClass pdb_helix_class = UnknownHelix;
336 int length = -1;
338 if (n >= 1 && n <= 10)
339 pdb_helix_class = static_cast<HelixClass>(n);
340 }
341};
342
343struct Sheet {
344 struct Strand {
347 int sense; // 0 = first strand, 1 = parallel, -1 = anti-parallel.
348 std::string name; // optional, _struct_sheet_range.id if from mmCIF
349 };
350 std::string name;
351 std::vector<Strand> strands;
352
353 Sheet() = default;
// Sets `name` from sheet_id; `strands` starts empty.
// NOTE(review): declared noexcept, yet name(sheet_id) copies a std::string,
// which can throw std::bad_alloc; an exception escaping a noexcept function
// calls std::terminate. Confirm whether noexcept is intended here.
354 explicit Sheet(const std::string& sheet_id) noexcept : name(sheet_id) {}
355};
356
357
358// bioassembly / biomolecule
359struct Assembly {
360 struct Operator {
361 std::string name; // optional
362 std::string type; // optional (from mmCIF only)
364 };
// Element type of the `generators` member declared later in this struct.
// NOTE(review): presumably one generator applies `operators` to the listed
// chains/subchains - confirm against the code that consumes it.
365 struct Gen {
366 std::vector<std::string> chains;
367 std::vector<std::string> subchains;
368 std::vector<Operator> operators;
369 };
// Type of the `special_kind` member declared later in this struct, where it
// defaults to NA. The underlying type is unsigned char.
370 enum class SpecialKind : unsigned char {
371 NA, CompleteIcosahedral, RepresentativeHelical, CompletePoint
372 };
373 std::string name;
374 bool author_determined = false;
375 bool software_determined = false;
376 SpecialKind special_kind = SpecialKind::NA;
377 int oligomeric_count = 0;
379 std::string software_name;
380 double absa = NAN; // TOTAL BURIED SURFACE AREA: ... ANGSTROM**2
381 double ssa = NAN; // SURFACE AREA OF THE COMPLEX: ... ANGSTROM**2
382 double more = NAN; // CHANGE IN SOLVENT FREE ENERGY: ... KCAL/MOL
383 std::vector<Gen> generators;
384
385 Assembly() = default;
386 explicit Assembly(const std::string& name_) : name(name_) {}
387};
388
389} // namespace gemmi
390#endif
Math utilities. 3D linear algebra.
bool is_polypeptide(PolymerType pt)
Definition metadata.hpp:231
bool is_polynucleotide(PolymerType pt)
Definition metadata.hpp:235
Definition seqid.hpp:151
SeqId – residue number and insertion code together.
std::vector< std::string > subchains
Definition metadata.hpp:367
std::vector< std::string > chains
Definition metadata.hpp:366
std::vector< Operator > operators
Definition metadata.hpp:368
std::string name
Definition metadata.hpp:373
Assembly()=default
std::string oligomeric_details
Definition metadata.hpp:378
std::vector< Gen > generators
Definition metadata.hpp:383
std::string software_name
Definition metadata.hpp:379
Assembly(const std::string &name_)
Definition metadata.hpp:386
AtomAddress partner_c
Definition metadata.hpp:294
AtomAddress partner1
Definition metadata.hpp:287
std::string name
Definition metadata.hpp:283
std::string link_id
Definition metadata.hpp:284
std::vector< DiffractionInfo > diffractions
Definition metadata.hpp:87
std::string ph_range
Definition metadata.hpp:86
std::string id
Definition metadata.hpp:83
std::string description
Definition metadata.hpp:84
std::string monochromator
Definition metadata.hpp:75
std::string synchrotron
Definition metadata.hpp:70
std::string wavelengths
Definition metadata.hpp:72
std::string collection_date
Definition metadata.hpp:76
std::string scattering_type
Definition metadata.hpp:73
std::string detector_make
Definition metadata.hpp:79
std::string source_type
Definition metadata.hpp:69
std::string db_name
Definition metadata.hpp:242
std::string id_code
Definition metadata.hpp:244
std::string isoform
Definition metadata.hpp:245
std::string accession_code
Definition metadata.hpp:243
SeqId::OptionalNum label_seq_begin
Definition metadata.hpp:248
std::vector< std::string > sifts_unp_acc
List of SIFTS Uniprot ACs referenced by SiftsUnpResidue::acc_index.
Definition metadata.hpp:258
std::vector< std::string > subchains
Definition metadata.hpp:251
std::string name
Definition metadata.hpp:250
std::vector< std::string > full_sequence
SEQRES or entity_poly_seq with microheterogeneity as comma-separated names.
Definition metadata.hpp:260
Entity(const std::string &name_) noexcept
Definition metadata.hpp:263
static std::string first_mon(const std::string &mon_list)
Definition metadata.hpp:264
std::vector< DbRef > dbrefs
Definition metadata.hpp:256
Entity()=default
std::vector< std::string > diffraction_ids
Definition metadata.hpp:62
std::vector< ReflectionsInfo > shells
Definition metadata.hpp:61
ReflectionsInfo reflections
Definition metadata.hpp:59
void set_helix_class_as_int(int n)
Definition metadata.hpp:337
AtomAddress end
Definition metadata.hpp:334
std::vector< SoftwareItem > software
Definition metadata.hpp:162
std::vector< CrystalInfo > crystals
Definition metadata.hpp:160
bool has(std::string RefinementInfo::*field) const
Definition metadata.hpp:175
std::vector< ExperimentInfo > experiments
Definition metadata.hpp:159
bool has(int RefinementInfo::*field) const
Definition metadata.hpp:171
std::string starting_model
Definition metadata.hpp:164
std::string remark_300_detail
Definition metadata.hpp:165
bool has(SMat33< double > RefinementInfo::*field) const
Definition metadata.hpp:179
bool has_restr() const
Definition metadata.hpp:183
bool has(double RefinementInfo::*field) const
Definition metadata.hpp:167
const std::vector< gemmi::TlsGroup > * get_tls_groups() const
Definition metadata.hpp:197
std::vector< RefinementInfo > refinement
Definition metadata.hpp:161
std::string solved_by
Definition metadata.hpp:163
std::vector< gemmi::TlsGroup > * get_tls_groups()
Definition metadata.hpp:191
std::vector< std::string > authors
Definition metadata.hpp:158
std::string parent_comp_id
Definition metadata.hpp:305
std::string details
Definition metadata.hpp:307
std::string chain_name
Definition metadata.hpp:303
std::string mod_id
Definition metadata.hpp:306
ResidueId res_id
Definition metadata.hpp:304
Coordinates in Angstroms - orthogonal (Cartesian) coordinates.
Definition unitcell.hpp:32
Restr(const std::string &name_)
Definition metadata.hpp:137
std::string rfree_selection_method
Definition metadata.hpp:141
std::string cross_validation_method
Definition metadata.hpp:140
std::vector< BasicRefinementInfo > bins
Definition metadata.hpp:143
SMat33< double > aniso_b
Definition metadata.hpp:145
std::vector< TlsGroup > tls_groups
Definition metadata.hpp:152
std::vector< Restr > restr_stats
Definition metadata.hpp:151
AtomAddress hbond_atom1
Definition metadata.hpp:346
std::vector< Strand > strands
Definition metadata.hpp:351
std::string name
Definition metadata.hpp:350
Sheet(const std::string &sheet_id) noexcept
Definition metadata.hpp:354
Sheet()=default
Reference to UniProt residue, based on _pdbx_sifts_xref_db.
Definition metadata.hpp:271
Classification classification
Definition metadata.hpp:30
std::string date
Definition metadata.hpp:26
std::string description
Definition metadata.hpp:27
std::string version
Definition metadata.hpp:25
std::string name
Definition metadata.hpp:24
std::string contact_author
Definition metadata.hpp:28
std::string contact_author_email
Definition metadata.hpp:29
std::string id
Definition metadata.hpp:99
SMat33< double > L
Definition metadata.hpp:103
std::vector< Selection > selections
Definition metadata.hpp:100
SMat33< double > T
Definition metadata.hpp:102
Position origin
Definition metadata.hpp:101
Unit cell.