Gemmi C++ API
Loading...
Searching...
No Matches
pdb.hpp
Go to the documentation of this file.
1// Copyright 2017 Global Phasing Ltd.
2//
3// Read PDB file format and store it in Structure.
4//
5// Based on the format spec:
6// https://www.wwpdb.org/documentation/file-format-content/format33/v3.3.html
7// + support for two-character chain IDs (columns 21 and 22)
8// + read segment ID (columns 73-76)
9// + read hybrid-36 serial numbers (http://cci.lbl.gov/hybrid_36/)
10// + hybrid-36 sequence id for sequences longer than 9999 (no such examples)
11// + allow for longer REMARK lines (up to 120 characters)
12
13#ifndef GEMMI_PDB_HPP_
14#define GEMMI_PDB_HPP_
15
16#include <algorithm> // for min, swap
17#include <cctype> // for isalpha
18#include <cstdio> // for stdin, size_t
19#include <cstdlib> // for strtol
20#include <cstring> // for memcpy, strstr, strchr
21#include <unordered_map>
22
23#include "fileutil.hpp" // for path_basename, file_open
24#include "input.hpp" // for FileStream
25#include "model.hpp" // for Atom, Structure, ...
26#include "polyheur.hpp" // for assign_subchains
27#include "remarks.hpp" // for read_metadata_from_remarks, read_int, ...
28
29namespace gemmi {
30
31namespace pdb_impl {
32
33template<int N> int read_base36(const char* p) {
34 char zstr[N+1] = {0};
35 std::memcpy(zstr, p, N);
36 return std::strtol(zstr, nullptr, 36);
37}
38
39// Compare the first 4 letters of s, ignoring case, with uppercase record.
40// Both args must have at least 3+1 chars. ' ' and NUL are equivalent in s.
41inline bool is_record_type(const char* s, const char* record) {
42 return ialpha4_id(s) == ialpha4_id(record);
43}
44// for record "TER": "TER ", TER\n, TER\r, TER\t match, TERE, TER1 don't
45inline bool is_record_type3(const char* s, const char* record) {
46 return (ialpha4_id(s) & ~0xf) == ialpha4_id(record);
47}
48
49// The standard charge format is 2+, but some files have +2.
50inline signed char read_charge(char digit, char sign) {
51 if (sign == ' ' && digit == ' ') // by far the most common case
52 return 0;
53 if (sign >= '0' && sign <= '9')
54 std::swap(digit, sign);
55 if (digit >= '0' && digit <= '9') {
56 if (sign != '+' && sign != '-' && sign != '\0' && !is_space(sign))
57 fail("Wrong format for charge: " +
58 std::string(1, digit) + std::string(1, sign));
59 return (digit - '0') * (sign == '-' ? -1 : 1);
60 }
61 // if we are here the field should be blank, but maybe better not to check
62 return 0;
63}
64
65inline int read_matrix(Transform& t, char* line, size_t len) {
66 if (len < 46)
67 return 0;
68 char n = line[5] - '0';
69 if (n >= 1 && n <= 3) {
70 t.mat[n-1][0] = read_double(line+10, 10);
71 t.mat[n-1][1] = read_double(line+20, 10);
72 t.mat[n-1][2] = read_double(line+30, 10);
73 t.vec.at(n-1) = read_double(line+45, 10);
74 }
75 return n;
76}
77
78inline SeqId read_seq_id(const char* str) {
79 SeqId seqid;
80 if (str[4] != '\r' && str[4] != '\n')
81 seqid.icode = str[4];
82 // We support hybrid-36 extension, although it is never used in practice
83 // as 9999 residues per chain are enough.
84 if (str[0] < 'A') {
85 for (int i = 4; i != 0; --i, ++str)
86 if (!is_space(*str)) {
87 seqid.num = read_int(str, i);
88 break;
89 }
90 } else {
91 seqid.num = read_base36<4>(str) - 466560 + 10000;
92 }
93 return seqid;
94}
95
96inline ResidueId read_res_id(const char* seq_id, const char* name) {
97 return {read_seq_id(seq_id), {}, read_string(name, 3)};
98}
99
100inline char read_altloc(char c) { return c == ' ' ? '\0' : c; }
101
102inline int read_serial(const char* ptr) {
103 return ptr[0] < 'A' ? read_int(ptr, 5)
104 : read_base36<5>(ptr) - 16796160 + 100000;
105}
106
107// move initials after comma, as in mmCIF (A.-B.DOE -> DOE, A.-B.), see
108// https://www.wwpdb.org/documentation/file-format-content/format33/sect2.html#AUTHOR
109inline void change_author_name_format_to_mmcif(std::string& name) {
110 // If the AUTHOR record has comma followed by space we get leading space here
111 while (name[0] == ' ')
112 name.erase(name.begin());
113 size_t pos = 0;
114 // Initials may have multiple letters (e.g. JU. or PON.)
115 // but should not have space after dot.
116 for (size_t i = 1; i < pos+4 && i+1 < name.size(); ++i)
117 if (name[i] == '.' && name[i+1] != ' ')
118 pos = i+1;
119 if (pos > 0)
120 name = name.substr(pos) + ", " + name.substr(0, pos);
121}
122
123inline Asu compare_link_symops(const std::string& record) {
124 if (record.size() < 72)
125 return Asu::Any; // it could be interpreted as Same
126 if (read_string(&record[59], 6) == read_string(&record[66], 6))
127 return Asu::Same;
128 return Asu::Different;
129}
130
131// Atom name and altloc are not provided in the SSBOND record.
132// Usually it is SG (cysteine), but other disulfide bonds are also possible.
133// If it's not SG, we pick the first sulfur atom in the residue.
134inline void complete_ssbond_atom(AtomAddress& ad, const Model& mdl) {
135 ad.atom_name = "SG";
136 const_CRA cra = mdl.find_cra(ad);
137 if (cra.residue && (!cra.atom || cra.atom->element != El::S))
138 if (const Atom* a = cra.residue->find_by_element(El::S)) {
139 ad.atom_name = a->name;
140 ad.altloc = a->altloc;
141 }
142}
143
144inline
145void process_conn(Structure& st, const std::vector<std::string>& conn_records) {
146 int disulf_count = 0;
147 int covale_count = 0;
148 int metalc_count = 0;
149 for (const std::string& record : conn_records) {
150 if (record[0] == 'S' || record[0] == 's') { // SSBOND
151 if (record.length() < 32)
152 continue;
153 Connection c;
154 c.name = "disulf" + std::to_string(++disulf_count);
155 c.type = Connection::Disulf;
156 const char* r = record.c_str();
157 c.partner1.chain_name = read_string(r + 14, 2);
158 c.partner1.res_id = read_res_id(r + 17, r + 11);
159 c.partner2.chain_name = read_string(r + 28, 2);
160 char res_id2[5] = {' ', ' ', ' ', ' ', ' '};
161 std::memcpy(res_id2, r + 31, std::min((size_t)5, record.length() - 31));
162 c.partner2.res_id = read_res_id(res_id2, r + 25);
163 c.asu = compare_link_symops(record);
164 if (record.length() > 73)
165 c.reported_distance = read_double(r + 73, 5);
166 complete_ssbond_atom(c.partner1, st.first_model());
167 complete_ssbond_atom(c.partner2, st.models[0]);
168 st.connections.emplace_back(c);
169 } else if (record[0] == 'L' || record[0] == 'l') { // LINK
170 if (record.length() < 57)
171 continue;
172 Connection c;
173 // emulating names used in wwPDB mmCIFs (covaleN and metalcN)
174 if (is_metal(find_element(&record[12])) ||
175 is_metal(find_element(&record[42]))) {
176 c.name = "metalc" + std::to_string(++metalc_count);
177 c.type = Connection::MetalC;
178 } else {
179 c.name = "covale" + std::to_string(++covale_count);
180 c.type = Connection::Covale;
181 }
182 for (int i : {0, 1}) {
183 const char* t = record.c_str() + 30 * i;
184 AtomAddress& ad = (i == 0 ? c.partner1 : c.partner2);
185 ad.chain_name = read_string(t + 20, 2);
186 ad.res_id = read_res_id(t + 22, t + 17);
187 ad.atom_name = read_string(t + 12, 4);
188 ad.altloc = read_altloc(t[16]);
189 }
190 c.asu = compare_link_symops(record);
191 if (record.length() > 73) {
192 if (record[4] == 'R')
193 c.link_id = read_string(&record[72], 8);
194 else
195 c.reported_distance = read_double(&record[73], 5);
196 }
197 st.connections.emplace_back(c);
198 } else if (record[0] == 'C' || record[0] == 'c') { // CISPEP
199 if (record.length() < 22)
200 continue;
201 const char* r = record.c_str();
202 CisPep cispep;
203 cispep.partner_c.chain_name = read_string(r + 14, 2);
204 cispep.partner_c.res_id = read_res_id(r + 17, r + 11);
205 cispep.partner_n.chain_name = read_string(r + 28, 2);
206 cispep.partner_n.res_id = read_res_id(r + 31, r + 25);
207 // In files with a single model in the PDB CISPEP modNum is 0,
208 // but _struct_mon_prot_cis.pdbx_PDB_model_num is 1.
209 cispep.model_str = st.models.size() == 1 ? st.models[0].name
210 : read_string(r + 43, 3);
211 cispep.reported_angle = read_double(r + 53, 6);
212 st.cispeps.push_back(cispep);
213 }
214 }
215}
216
217template<typename Stream>
218Structure read_pdb_from_stream(Stream&& stream, const std::string& source,
219 const PdbReadOptions& options) {
220 int line_num = 0;
221 auto wrong = [&line_num](const std::string& msg) {
222 fail("Problem in line " + std::to_string(line_num) + ": " + msg);
223 };
224 Structure st;
225 st.input_format = CoorFormat::Pdb;
226 st.name = path_basename(source, {".gz", ".pdb"});
227 std::vector<std::string> conn_records;
228 Model *model = nullptr;
229 Chain *chain = nullptr;
230 Residue *resi = nullptr;
231 char line[122] = {0};
232 int max_line_length = options.max_line_length;
233 if (max_line_length <= 0 || max_line_length > 120)
234 max_line_length = 120;
235 bool after_ter = false;
236 Transform matrix;
237 std::unordered_map<ResidueId, int> resmap;
238 while (size_t len = copy_line_from_stream(line, max_line_length+1, stream)) {
239 ++line_num;
240 if (is_record_type(line, "ATOM") || is_record_type(line, "HETATM")) {
241 if (len < 55)
242 wrong("The line is too short to be correct:\n" + std::string(line));
243 std::string chain_name = read_string(line+20, 2);
244 ResidueId rid = read_res_id(line+22, line+17);
245
246 if (!chain || chain_name != chain->name) {
247 if (!model) {
248 // A single model usually doesn't have the MODEL record. Also,
249 // MD trajectories may have frames separated by ENDMDL without MODEL.
250 std::string name = std::to_string(st.models.size() + 1);
251 if (st.find_model(name))
252 wrong("ATOM/HETATM between models");
253 st.models.emplace_back(name);
254 model = &st.models.back();
255 }
256 const Chain* prev_part = model->find_chain(chain_name);
257 after_ter = prev_part &&
258 prev_part->residues[0].entity_type == EntityType::Polymer;
259 model->chains.emplace_back(chain_name);
260 chain = &model->chains.back();
261 resmap.clear();
262 resi = nullptr;
263 }
264 // Non-standard but widely used 4-character segment identifier.
265 // Left-justified, and may include a space in the middle.
266 // The segment may be a portion of a chain or a complete chain.
267 if (len > 72)
268 rid.segment = read_string(line+72, 4);
269 if (!resi || !resi->matches(rid)) {
270 auto it = resmap.find(rid);
271 // In normal PDB files it is fast enough to use
272 // resi = chain->find_residue(rid);
273 // but in pseudo-PDB files (such as MD files where millions
274 // of residues are in the same "chain") it is too slow.
275 if (it == resmap.end()) {
276 resmap.emplace(rid, (int) chain->residues.size());
277 chain->residues.emplace_back(rid);
278 resi = &chain->residues.back();
279
280 resi->het_flag = line[0] & ~0x20;
281 if (after_ter)
282 resi->entity_type = resi->is_water() ? EntityType::Water
284 } else {
285 resi = &chain->residues[it->second];
286 }
287 }
288
289 Atom atom;
290 atom.serial = read_serial(line+6);
291 atom.name = read_string(line+12, 4);
292 atom.altloc = read_altloc(line[16]);
293 atom.pos.x = read_double(line+30, 8);
294 atom.pos.y = read_double(line+38, 8);
295 atom.pos.z = read_double(line+46, 8);
296 if (len > 58)
297 atom.occ = (float) read_double(line+54, 6);
298 if (len > 64)
299 atom.b_iso = (float) read_double(line+60, 6);
300 if (len > 76 && (std::isalpha(line[76]) || std::isalpha(line[77])))
301 atom.element = Element(line + 76);
302 // Atom names HXXX are ambiguous, but Hg, He, Hf, Ho and Hs (almost)
303 // never have 4-character names, so H is assumed.
304 else if (alpha_up(line[12]) == 'H' && line[15] != ' ')
305 atom.element = El::H;
306 // Similarly Deuterium (DXXX), but here alternatives are Dy, Db and Ds.
307 // Only Dysprosium is present in the PDB - in a single entry as of 2022.
308 else if (alpha_up(line[12]) == 'D' && line[15] != ' ')
309 atom.element = El::D;
310 // Old versions of the PDB format had hydrogen names such as "1HB ".
311 // Some MD files use similar names for other elements ("1C4A" -> C).
312 else if (is_digit(line[12]))
313 atom.element = impl::find_single_letter_element(line[13]);
314 // ... or it can be "C210"
315 else if (is_digit(line[13]))
316 atom.element = impl::find_single_letter_element(line[12]);
317 else
318 atom.element = Element(line + 12);
319 atom.charge = (len > 78 ? read_charge(line[78], line[79]) : 0);
320 resi->atoms.emplace_back(atom);
321
322 } else if (is_record_type(line, "ANISOU")) {
323 if (!model || !chain || !resi || resi->atoms.empty())
324 wrong("ANISOU record not directly after ATOM/HETATM.");
325 // We assume that ANISOU refers to the last atom.
326 // Can it not be the case?
327 Atom &atom = resi->atoms.back();
328 if (atom.aniso.u11 != 0.)
329 wrong("Duplicated ANISOU record or not directly after ATOM/HETATM.");
330 atom.aniso.u11 = read_int(line+28, 7) * 1e-4f;
331 atom.aniso.u22 = read_int(line+35, 7) * 1e-4f;
332 atom.aniso.u33 = read_int(line+42, 7) * 1e-4f;
333 atom.aniso.u12 = read_int(line+49, 7) * 1e-4f;
334 atom.aniso.u13 = read_int(line+56, 7) * 1e-4f;
335 atom.aniso.u23 = read_int(line+63, 7) * 1e-4f;
336
337 } else if (is_record_type(line, "REMARK")) {
338 if (line[len-1] == '\n')
339 --len;
340 if (line[len-1] == '\r')
341 --len;
342 st.raw_remarks.emplace_back(line, line+len);
343 if (len <= 11)
344 continue;
345 int num = read_int(line + 7, 3);
346 // By default, we only look for resolution and REMARK 350.
347 // Other remarks are parsed in read_metadata_from_remarks()
348 if (num == 2) {
349 if (st.resolution == 0.0 && std::strstr(line, "ANGSTROM"))
350 st.resolution = read_double(line + 23, 7);
351 } else if (num == 3) {
352 if (st.resolution == 0.0 &&
353 std::strstr(line, "RESOLUTION RANGE HIGH (ANGSTROMS)"))
354 if (const char* colon = std::strchr(line + 44, ':'))
355 st.resolution = fast_atof(colon + 1);
356 } else if (num == 350) {
357 const char* colon = std::strchr(line+11, ':');
358 if (colon == line+22 && starts_with(line+11, "BIOMOLECULE")) {
359 st.assemblies.emplace_back(read_string(line+23, 20));
360 continue;
361 }
362 if (st.assemblies.empty())
363 continue;
364 Assembly& assembly = st.assemblies.back();
365 auto r350_key = [&](int cpos, const char* text) {
366 return colon == line + cpos && starts_with(line+11, text);
367 };
368 if (starts_with(line+11, " BIOMT")) {
369 if (read_matrix(matrix, line+13, len-13) == 3)
370 if (!assembly.generators.empty()) {
371 auto& opers = assembly.generators.back().operators;
372 opers.emplace_back();
373 opers.back().name = read_string(line+20, 3);
374 opers.back().transform = matrix;
375 matrix.set_identity();
376 }
377 } else if (r350_key(44, "AUTHOR DETERMINED")) {
378 assembly.author_determined = true;
379 assembly.oligomeric_details = read_string(line+45, 35);
380 } else if (r350_key(51, "SOFTWARE DETERMINED")) {
381 assembly.software_determined = true;
382 assembly.oligomeric_details = read_string(line+52, 28);
383 } else if (r350_key(24, "SOFTWARE USED")) {
384 assembly.software_name = read_string(line+25, 55);
385 } else if (r350_key(36, "TOTAL BURIED SURFACE AREA")) {
386 assembly.absa = read_double(line+37, 12);
387 } else if (r350_key(38, "SURFACE AREA OF THE COMPLEX")) {
388 assembly.ssa = read_double(line+39, 12);
389 } else if (r350_key(40, "CHANGE IN SOLVENT FREE ENERGY")) {
390 assembly.more = read_double(line+41, 12);
391 } else if (r350_key(40, "APPLY THE FOLLOWING TO CHAINS") ||
392 r350_key(40, " AND CHAINS")) {
393 if (line[11] == 'A') // first line - APPLY ...
394 assembly.generators.emplace_back();
395 else if (assembly.generators.empty())
396 continue;
397 split_str_into_multi(read_string(line+41, 39), ", ",
398 assembly.generators.back().chains);
399 }
400 }
401
402 } else if (is_record_type(line, "CONECT")) {
403 int serial = read_serial(line+6);
404 if (len >= 11 && serial != 0) {
405 std::vector<int>& bonded_atoms = st.conect_map[serial];
406 int limit = std::min(27, (int)len - 1);
407 for (int offset = 11; offset <= limit; offset += 5) {
408 int n = read_serial(line+offset);
409 if (n != 0)
410 bonded_atoms.push_back(n);
411 }
412 }
413
414 } else if (is_record_type(line, "SEQRES")) {
415 std::string chain_name = read_string(line+10, 2);
416 Entity& ent = impl::find_or_add(st.entities, chain_name);
417 ent.entity_type = EntityType::Polymer;
418 for (int i = 19; i < 68; i += 4) {
419 std::string res_name = read_string(line+i, 3);
420 if (!res_name.empty())
421 ent.full_sequence.emplace_back(res_name);
422 }
423
424 } else if (is_record_type(line, "MODRES")) {
425 ModRes modres;
426 modres.chain_name = read_string(line + 15, 2);
427 modres.res_id = read_res_id(line + 18, line + 12);
428 modres.parent_comp_id = read_string(line + 24, 3);
429 if (len >= 30)
430 // this field is named comment in PDB spec, but details in mmCIF
431 modres.details = read_string(line + 29, 41);
432 // Refmac's extension: 73-80 mod_id
433 // Check for spaces to make sure it's not an overflowed comment
434 if (len >= 73 && line[70] == ' ' && line[71] == ' ')
435 modres.mod_id = read_string(line + 72, 8);
436 st.mod_residues.push_back(modres);
437
438 } else if (is_record_type(line, "HETNAM")) {
439 if (len > 71 && line[70] == ' ') {
440 std::string full_code = read_string(line + 71, 8);
441 if (!full_code.empty())
442 st.shortened_ccd_codes.push_back({full_code, read_string(line + 11, 3)});
443 }
444
445 } else if (is_record_type(line, "DBREF")) { // DBREF or DBREF1 or DBREF2
446 std::string chain_name = read_string(line+11, 2);
447 Entity& ent = impl::find_or_add(st.entities, chain_name);
448 ent.entity_type = EntityType::Polymer;
449 if (line[5] == ' ' || line[5] == '1')
450 ent.dbrefs.emplace_back();
451 else if (ent.dbrefs.empty()) // DBREF2 without DBREF1?
452 continue;
453 Entity::DbRef& dbref = ent.dbrefs.back();
454 if (line[5] == ' ' || line[5] == '1') {
455 dbref.seq_begin = read_seq_id(line+14);
456 dbref.seq_end = read_seq_id(line+20);
457 dbref.db_name = read_string(line+26, 6);
458 if (line[5] == ' ') {
459 dbref.accession_code = read_string(line+33, 8);
460 dbref.id_code = read_string(line+42, 12);
461 dbref.db_begin.num = read_int(line+55, 5);
462 dbref.db_begin.icode = line[60];
463 dbref.db_end.num = read_int(line+62, 5);
464 dbref.db_end.icode = line[67];
465 } else { // line[5] == '1'
466 dbref.id_code = read_string(line+47, 20);
467 }
468 } else if (line[5] == '2') {
469 dbref.accession_code = read_string(line+18, 22);
470 dbref.db_begin.num = read_int(line+45, 10);
471 dbref.db_end.num = read_int(line+57, 10);
472 }
473 } else if (is_record_type(line, "HEADER")) {
474 if (len > 50)
475 st.info["_struct_keywords.pdbx_keywords"] = rtrim_str(std::string(line+10, 40));
476 if (len > 59) { // date in PDB has format 28-MAR-07
477 std::string date = pdb_date_format_to_iso(std::string(line+50, 9));
478 if (!date.empty())
479 st.info["_pdbx_database_status.recvd_initial_deposition_date"] = date;
480 }
481 if (len > 66) {
482 std::string entry_id = rtrim_str(std::string(line+62, 4));
483 if (!entry_id.empty())
484 st.info["_entry.id"] = entry_id;
485 }
486 } else if (is_record_type(line, "TITLE")) {
487 if (len > 10)
488 st.info["_struct.title"] += rtrim_str(std::string(line+10, len-10-1));
489
490 } else if (is_record_type(line, "KEYWDS")) {
491 if (len > 10)
492 st.info["_struct_keywords.text"] += rtrim_str(std::string(line+10, len-10-1));
493
494 } else if (is_record_type(line, "EXPDTA")) {
495 if (len > 10)
496 st.info["_exptl.method"] += trim_str(std::string(line+10, len-10-1));
497
498 } else if (is_record_type(line, "AUTHOR") && len > 10) {
499 std::string last;
500 if (!st.meta.authors.empty()) {
501 last = st.meta.authors.back();
502 st.meta.authors.pop_back();
503 }
504 size_t prev_size = st.meta.authors.size();
505 const char* start = skip_blank(line+10);
506 const char* end = rtrim_cstr(start, line+len);
507 split_str_into(std::string(start, end), ',', st.meta.authors);
508 if (!last.empty() && st.meta.authors.size() > prev_size) {
509 // the spaces were trimmed, we may need a space between words
510 if (last.back() != '-' && last.back() != '.')
511 last += ' ';
512 st.meta.authors[prev_size].insert(0, last);
513 }
514
515 } else if (is_record_type(line, "CRYST1")) {
516 if (len > 54)
517 st.cell.set(read_double(line+6, 9),
518 read_double(line+15, 9),
519 read_double(line+24, 9),
520 read_double(line+33, 7),
521 read_double(line+40, 7),
522 read_double(line+47, 7));
523 if (len > 56)
524 st.spacegroup_hm = read_string(line+55, 11);
525 if (len > 67) {
526 std::string z = read_string(line+66, 4);
527 if (!z.empty())
528 st.info["_cell.Z_PDB"] = z;
529 }
530 } else if (is_record_type(line, "MTRIXn")) {
531 if (read_matrix(matrix, line, len) == 3) {
532 std::string id = read_string(line+7, 3);
533 if (matrix.is_identity()) {
534 // store only ID that will be used when writing to file
535 st.info["_struct_ncs_oper.id"] = id;
536 } else {
537 bool given = len > 59 && line[59] == '1';
538 st.ncs.push_back({id, given, matrix});
539 matrix.set_identity();
540 }
541 }
542 } else if (is_record_type(line, "MODEL")) {
543 if (model && chain)
544 wrong("MODEL without ENDMDL?");
545 std::string name = std::to_string(read_int(line+10, 4));
546 model = &st.find_or_add_model(name);
547 if (!model->chains.empty())
548 wrong("duplicate MODEL number: " + name);
549 chain = nullptr;
550
551 } else if (is_record_type(line, "ENDMDL")) {
552 model = nullptr;
553 chain = nullptr;
554
555 } else if (is_record_type3(line, "TER")) { // finishes polymer chains
556 if (!chain || st.ter_status == 'e')
557 continue;
558 st.ter_status = 'y';
559 if (options.split_chain_on_ter) {
560 chain = nullptr;
561 // split_chain_on_ter is used for AMBER files that can have TER records
562 // in various places. So in such case TER doesn't imply entity_type.
563 continue;
564 }
565 // If we have 2+ TER records in one chain, they are used in non-standard
566 // way and should be better ignored (in all the chains).
567 if (after_ter) {
568 st.ter_status = 'e'; // all entity_types will be later set to Unknown
569 continue;
570 }
571 for (Residue& res : chain->residues) {
572 res.entity_type = EntityType::Polymer;
573 // Sanity check: water should not be marked as a polymer.
574 if GEMMI_UNLIKELY(res.is_water())
575 st.ter_status = 'e'; // all entity_types will be later set to Unknown
576 }
577 after_ter = true;
578 } else if (is_record_type(line, "SCALEn")) {
579 if (read_matrix(matrix, line, len) == 3) {
580 st.cell.set_matrices_from_fract(matrix);
581 matrix.set_identity();
582 }
583
584 } else if (is_record_type(line, "ORIGX")) {
585 st.has_origx = true;
586 read_matrix(st.origx, line, len);
587
588 } else if (is_record_type(line, "HELIX")) {
589 if (len < 40)
590 continue;
591 Helix helix;
592 helix.start.chain_name = read_string(line+18, 2);
593 helix.start.res_id = read_res_id(line+21, line+15);
594 helix.end.chain_name = read_string(line+30, 2);
595 helix.end.res_id = read_res_id(line+33, line+27);
596 helix.set_helix_class_as_int(read_int(line+38, 2));
597 if (len > 72)
598 helix.length = read_int(line+72, 5);
599 st.helices.emplace_back(helix);
600
601 } else if (is_record_type(line, "SHEET")) {
602 if (len < 40)
603 continue;
604 std::string sheet_id = read_string(line+11, 3);
605 Sheet& sheet = impl::find_or_add(st.sheets, sheet_id);
606 sheet.strands.emplace_back();
607 Sheet::Strand& strand = sheet.strands.back();
608 strand.start.chain_name = read_string(line+20, 2);
609 strand.start.res_id = read_res_id(line+22, line+17);
610 strand.end.chain_name = read_string(line+31, 2);
611 strand.end.res_id = read_res_id(line+33, line+28);
612 strand.sense = read_int(line+38, 2);
613 if (len > 67) {
614 // the SHEET record has no altloc for atoms of hydrogen bond
615 strand.hbond_atom2.atom_name = read_string(line+41, 4);
616 strand.hbond_atom2.chain_name = read_string(line+48, 2);
617 strand.hbond_atom2.res_id = read_res_id(line+50, line+45);
618 strand.hbond_atom1.atom_name = read_string(line+56, 4);
619 strand.hbond_atom1.chain_name = read_string(line+63, 2);
620 strand.hbond_atom1.res_id = read_res_id(line+65, line+60);
621 }
622
623 } else if (is_record_type(line, "SSBOND") ||
624 is_record_type(line, "LINK") ||
625 is_record_type(line, "CISPEP")) {
626 conn_records.emplace_back(line);
627
628 } else if (is_record_type3(line, "END")) {
629 break;
630 } else if (is_record_type(line, "data")) {
631 if (line[4] == '_' && !model)
632 fail("Incorrect file format (perhaps it is cif not pdb?): " + source);
633 } else if (is_record_type(line, "{\"da")) {
634 if (ialpha3_id(line+4) == ialpha3_id("ta_") && !model)
635 fail("Incorrect file format (perhaps it is mmJSON not pdb?): " + source);
636 }
637 }
638
639 // If we read a PDB header (they can be downloaded from RSCB) we have no
640 // models. User's code may not expect this. Usually, empty model will be
641 // handled more gracefully than no models.
642 if (st.models.empty())
643 st.models.emplace_back("1");
644
645 if (st.ter_status == 'e')
647
648 // Here we assign Residue::subchain, but only for chains with all
649 // Residue::entity_type assigned, i.e. for chains with TER.
650 assign_subchains(st, /*force=*/false, /*fail_if_unknown=*/false);
651
652 for (Chain& ch : st.models[0].chains)
653 if (Entity* entity = st.get_entity(ch.name))
654 if (auto polymer = ch.get_polymer())
655 entity->subchains.emplace_back(polymer.subchain_id());
656
657 st.setup_cell_images();
658
659 process_conn(st, conn_records);
660
661 for (std::string& name : st.meta.authors)
662 change_author_name_format_to_mmcif(name);
663
664 if (!options.skip_remarks)
666
668
669 return st;
670}
671
672} // namespace pdb_impl
673
674inline Structure read_pdb_file(const std::string& path,
676 auto f = file_open(path.c_str(), "rb");
677 return pdb_impl::read_pdb_from_stream(FileStream{f.get()}, path, options);
678}
679
680inline Structure read_pdb_from_memory(const char* data, size_t size,
681 const std::string& name,
683 return pdb_impl::read_pdb_from_stream(MemoryStream(data, size), name, options);
684}
685
686inline Structure read_pdb_string(const std::string& str,
687 const std::string& name,
689 return read_pdb_from_memory(str.c_str(), str.length(), name, options);
690}
691
692// A function for transparent reading of stdin and/or gzipped files.
693template<typename T>
695 if (input.is_stdin())
696 return pdb_impl::read_pdb_from_stream(FileStream{stdin}, "stdin", options);
697 if (input.is_compressed())
698 return pdb_impl::read_pdb_from_stream(input.get_uncompressing_stream(),
699 input.path(), options);
700 return read_pdb_file(input.path(), options);
701}
702
703} // namespace gemmi
704#endif
#define GEMMI_UNLIKELY(x)
Definition fail.hpp:39
Document read_string(const std::string &data)
Definition cif.hpp:303
const char * rtrim_cstr(const char *start, const char *end=nullptr)
Definition util.hpp:124
El find_element(const char *symbol)
Definition elem.hpp:267
Structure read_pdb_file(const std::string &path, PdbReadOptions options=PdbReadOptions())
Definition pdb.hpp:674
double fast_atof(const char *p, const char **endptr=nullptr)
Definition atof.hpp:32
size_t copy_line_from_stream(char *line, int size, Input &&in)
Definition input.hpp:155
constexpr int ialpha3_id(const char *s)
Definition util.hpp:309
std::string rtrim_str(const std::string &str)
Definition util.hpp:118
void read_metadata_from_remarks(Structure &st)
Definition remarks.hpp:533
GEMMI_DLL void assign_subchains(Structure &st, bool force, bool fail_if_unknown=true)
std::string path_basename(const std::string &path, std::initializer_list< const char * > exts)
Definition fileutil.hpp:23
fileptr_t file_open(const char *path, const char *mode)
Definition fileutil.hpp:39
void split_str_into_multi(const std::string &str, const char *seps, std::vector< std::string > &result)
Definition util.hpp:164
GEMMI_DLL void remove_entity_types(Structure &st)
Assigns entity_type=Unknown for all residues.
bool is_digit(char c)
Definition atox.hpp:43
Structure read_pdb_string(const std::string &str, const std::string &name, PdbReadOptions options=PdbReadOptions())
Definition pdb.hpp:686
bool is_metal(El el)
Definition elem.hpp:29
Structure read_pdb(T &&input, PdbReadOptions options=PdbReadOptions())
Definition pdb.hpp:694
bool starts_with(const std::string &str, const std::string &prefix)
Definition util.hpp:38
void fail(const std::string &msg)
Definition fail.hpp:59
void split_str_into(const std::string &str, S sep, std::vector< std::string > &result)
Definition util.hpp:145
bool is_space(char c)
Definition atox.hpp:23
std::string trim_str(const std::string &str)
Definition util.hpp:109
void restore_full_ccd_codes(Structure &st)
Definition polyheur.hpp:212
constexpr int ialpha4_id(const char *s)
Definition util.hpp:305
char alpha_up(char c)
Definition util.hpp:60
Structure read_pdb_from_memory(const char *data, size_t size, const std::string &name, PdbReadOptions options=PdbReadOptions())
Definition pdb.hpp:680
const char * skip_blank(const char *p)
Definition atox.hpp:47
options affecting how pdb file is read
Definition model.hpp:94
char icode
Definition seqid.hpp:56