21#include <unordered_map>
33template<
int N>
int read_base36(
const char* p) {
35 std::memcpy(zstr, p,
N);
36 return std::strtol(zstr,
nullptr, 36);
41inline bool is_record_type(
const char* s,
const char* record) {
45inline bool is_record_type3(
const char* s,
const char* record) {
50inline signed char read_charge(
char digit,
char sign) {
51 if (sign ==
' ' && digit ==
' ')
53 if (sign >=
'0' && sign <=
'9')
54 std::swap(digit, sign);
55 if (digit >=
'0' && digit <=
'9') {
56 if (sign !=
'+' && sign !=
'-' && sign !=
'\0' && !
is_space(sign))
57 fail(
"Wrong format for charge: " +
58 std::string(1, digit) + std::string(1, sign));
59 return (digit -
'0') * (sign ==
'-' ? -1 : 1);
65inline int read_matrix(Transform& t,
char* line,
size_t len) {
68 char n = line[5] -
'0';
69 if (n >= 1 && n <= 3) {
70 t.mat[n-1][0] = read_double(line+10, 10);
71 t.mat[n-1][1] = read_double(line+20, 10);
72 t.mat[n-1][2] = read_double(line+30, 10);
73 t.vec.at(n-1) = read_double(line+45, 10);
78inline SeqId read_seq_id(
const char* str) {
80 if (str[4] !=
'\r' && str[4] !=
'\n')
85 for (
int i = 4; i != 0; --i, ++str)
87 seqid.num = read_int(str, i);
91 seqid.num = read_base36<4>(str) - 466560 + 10000;
96inline ResidueId read_res_id(
const char* seq_id,
const char* name) {
97 return {read_seq_id(seq_id), {},
read_string(name, 3)};
100inline char read_altloc(
char c) {
return c ==
' ' ?
'\0' : c; }
102inline int read_serial(
const char* ptr) {
103 return ptr[0] <
'A' ? read_int(ptr, 5)
104 : read_base36<5>(ptr) - 16796160 + 100000;
109inline void change_author_name_format_to_mmcif(std::string& name) {
111 while (name[0] ==
' ')
112 name.erase(name.begin());
116 for (
size_t i = 1; i < pos+4 && i+1 < name.size(); ++i)
117 if (name[i] ==
'.' && name[i+1] !=
' ')
120 name = name.substr(pos) +
", " + name.substr(0, pos);
123inline Asu compare_link_symops(
const std::string& record) {
124 if (record.size() < 72)
134inline void complete_ssbond_atom(AtomAddress& ad,
const Model& mdl) {
136 const_CRA cra = mdl.find_cra(ad);
137 if (cra.residue && (!cra.atom || cra.atom->element !=
El::S))
138 if (
const Atom* a = cra.residue->find_by_element(
El::S)) {
139 ad.atom_name = a->name;
140 ad.altloc = a->altloc;
145void process_conn(Structure& st,
const std::vector<std::string>& conn_records) {
146 int disulf_count = 0;
147 int covale_count = 0;
148 int metalc_count = 0;
149 for (
const std::string& record : conn_records) {
150 if (record[0] ==
'S' || record[0] ==
's') {
151 if (record.length() < 32)
154 c.name =
"disulf" + std::to_string(++disulf_count);
156 const char* r = record.c_str();
158 c.partner1.res_id = read_res_id(r + 17, r + 11);
160 char res_id2[5] = {
' ',
' ',
' ',
' ',
' '};
161 std::memcpy(res_id2, r + 31, std::min((
size_t)5, record.length() - 31));
162 c.partner2.res_id = read_res_id(res_id2, r + 25);
163 c.asu = compare_link_symops(record);
164 if (record.length() > 73)
165 c.reported_distance = read_double(r + 73, 5);
166 complete_ssbond_atom(c.partner1, st.first_model());
167 complete_ssbond_atom(c.partner2, st.models[0]);
168 st.connections.emplace_back(c);
169 }
else if (record[0] ==
'L' || record[0] ==
'l') {
170 if (record.length() < 57)
176 c.name =
"metalc" + std::to_string(++metalc_count);
179 c.name =
"covale" + std::to_string(++covale_count);
182 for (
int i : {0, 1}) {
183 const char* t = record.c_str() + 30 * i;
184 AtomAddress& ad = (i == 0 ? c.partner1 : c.partner2);
186 ad.res_id = read_res_id(t + 22, t + 17);
188 ad.altloc = read_altloc(t[16]);
190 c.asu = compare_link_symops(record);
191 if (record.length() > 73) {
192 if (record[4] ==
'R')
195 c.reported_distance = read_double(&record[73], 5);
197 st.connections.emplace_back(c);
198 }
else if (record[0] ==
'C' || record[0] ==
'c') {
199 if (record.length() < 22)
201 const char* r = record.c_str();
203 cispep.partner_c.chain_name =
read_string(r + 14, 2);
204 cispep.partner_c.res_id = read_res_id(r + 17, r + 11);
205 cispep.partner_n.chain_name =
read_string(r + 28, 2);
206 cispep.partner_n.res_id = read_res_id(r + 31, r + 25);
209 cispep.model_str = st.models.size() == 1 ? st.models[0].name
211 cispep.reported_angle = read_double(r + 53, 6);
212 st.cispeps.push_back(cispep);
217template<
typename Stream>
218Structure read_pdb_from_stream(Stream&& stream,
const std::string& source,
219 const PdbReadOptions& options) {
221 auto wrong = [&line_num](
const std::string& msg) {
222 fail(
"Problem in line " + std::to_string(line_num) +
": " + msg);
227 std::vector<std::string> conn_records;
228 Model *model =
nullptr;
229 Chain *chain =
nullptr;
230 Residue *resi =
nullptr;
231 char line[122] = {0};
232 int max_line_length = options.max_line_length;
233 if (max_line_length <= 0 || max_line_length > 120)
234 max_line_length = 120;
235 bool after_ter =
false;
237 std::unordered_map<ResidueId, int> resmap;
240 if (is_record_type(line,
"ATOM") || is_record_type(line,
"HETATM")) {
242 wrong(
"The line is too short to be correct:\n" + std::string(line));
244 ResidueId rid = read_res_id(line+22, line+17);
246 if (!chain || chain_name != chain->name) {
250 std::string name = std::to_string(st.models.size() + 1);
251 if (st.find_model(name))
252 wrong(
"ATOM/HETATM between models");
253 st.models.emplace_back(name);
254 model = &st.models.back();
256 const Chain* prev_part = model->find_chain(chain_name);
257 after_ter = prev_part &&
259 model->chains.emplace_back(chain_name);
260 chain = &model->chains.back();
269 if (!resi || !resi->matches(rid)) {
270 auto it = resmap.find(rid);
275 if (it == resmap.end()) {
276 resmap.emplace(rid, (
int) chain->residues.size());
277 chain->residues.emplace_back(rid);
278 resi = &chain->residues.back();
280 resi->het_flag = line[0] & ~0x20;
285 resi = &chain->residues[it->second];
290 atom.serial = read_serial(line+6);
292 atom.altloc = read_altloc(line[16]);
293 atom.pos.x = read_double(line+30, 8);
294 atom.pos.y = read_double(line+38, 8);
295 atom.pos.z = read_double(line+46, 8);
297 atom.occ = (float) read_double(line+54, 6);
299 atom.b_iso = (float) read_double(line+60, 6);
300 if (len > 76 && (std::isalpha(line[76]) || std::isalpha(line[77])))
301 atom.element = Element(line + 76);
304 else if (
alpha_up(line[12]) ==
'H' && line[15] !=
' ')
305 atom.element =
El::H;
308 else if (
alpha_up(line[12]) ==
'D' && line[15] !=
' ')
309 atom.element =
El::D;
313 atom.element = impl::find_single_letter_element(line[13]);
316 atom.element = impl::find_single_letter_element(line[12]);
318 atom.element = Element(line + 12);
319 atom.charge = (len > 78 ? read_charge(line[78], line[79]) : 0);
320 resi->atoms.emplace_back(atom);
322 }
else if (is_record_type(line,
"ANISOU")) {
323 if (!model || !chain || !resi || resi->atoms.empty())
324 wrong(
"ANISOU record not directly after ATOM/HETATM.");
327 Atom &atom = resi->atoms.back();
328 if (atom.aniso.u11 != 0.)
329 wrong(
"Duplicated ANISOU record or not directly after ATOM/HETATM.");
330 atom.aniso.u11 = read_int(line+28, 7) * 1e-4f;
331 atom.aniso.u22 = read_int(line+35, 7) * 1e-4f;
332 atom.aniso.u33 = read_int(line+42, 7) * 1e-4f;
333 atom.aniso.u12 = read_int(line+49, 7) * 1e-4f;
334 atom.aniso.u13 = read_int(line+56, 7) * 1e-4f;
335 atom.aniso.u23 = read_int(line+63, 7) * 1e-4f;
337 }
else if (is_record_type(line,
"REMARK")) {
338 if (line[len-1] ==
'\n')
340 if (line[len-1] ==
'\r')
342 st.raw_remarks.emplace_back(line, line+len);
345 int num = read_int(line + 7, 3);
349 if (st.resolution == 0.0 && std::strstr(line,
"ANGSTROM"))
350 st.resolution = read_double(line + 23, 7);
351 }
else if (num == 3) {
352 if (st.resolution == 0.0 &&
353 std::strstr(line,
"RESOLUTION RANGE HIGH (ANGSTROMS)"))
354 if (
const char* colon = std::strchr(line + 44,
':'))
356 }
else if (num == 350) {
357 const char* colon = std::strchr(line+11,
':');
358 if (colon == line+22 &&
starts_with(line+11,
"BIOMOLECULE")) {
359 st.assemblies.emplace_back(
read_string(line+23, 20));
362 if (st.assemblies.empty())
364 Assembly& assembly = st.assemblies.back();
365 auto r350_key = [&](
int cpos,
const char* text) {
366 return colon == line + cpos &&
starts_with(line+11, text);
369 if (read_matrix(matrix, line+13, len-13) == 3)
370 if (!assembly.generators.empty()) {
371 auto& opers = assembly.generators.back().operators;
372 opers.emplace_back();
374 opers.back().transform = matrix;
375 matrix.set_identity();
377 }
else if (r350_key(44,
"AUTHOR DETERMINED")) {
378 assembly.author_determined =
true;
379 assembly.oligomeric_details =
read_string(line+45, 35);
380 }
else if (r350_key(51,
"SOFTWARE DETERMINED")) {
381 assembly.software_determined =
true;
382 assembly.oligomeric_details =
read_string(line+52, 28);
383 }
else if (r350_key(24,
"SOFTWARE USED")) {
385 }
else if (r350_key(36,
"TOTAL BURIED SURFACE AREA")) {
386 assembly.absa = read_double(line+37, 12);
387 }
else if (r350_key(38,
"SURFACE AREA OF THE COMPLEX")) {
388 assembly.ssa = read_double(line+39, 12);
389 }
else if (r350_key(40,
"CHANGE IN SOLVENT FREE ENERGY")) {
390 assembly.more = read_double(line+41, 12);
391 }
else if (r350_key(40,
"APPLY THE FOLLOWING TO CHAINS") ||
392 r350_key(40,
" AND CHAINS")) {
394 assembly.generators.emplace_back();
395 else if (assembly.generators.empty())
398 assembly.generators.back().chains);
402 }
else if (is_record_type(line,
"CONECT")) {
403 int serial = read_serial(line+6);
404 if (len >= 11 && serial != 0) {
405 std::vector<int>& bonded_atoms = st.conect_map[serial];
406 int limit = std::min(27, (
int)len - 1);
407 for (
int offset = 11; offset <= limit; offset += 5) {
408 int n = read_serial(line+offset);
410 bonded_atoms.push_back(n);
414 }
else if (is_record_type(line,
"SEQRES")) {
416 Entity& ent = impl::find_or_add(st.entities, chain_name);
418 for (
int i = 19; i < 68; i += 4) {
420 if (!res_name.empty())
421 ent.full_sequence.emplace_back(res_name);
424 }
else if (is_record_type(line,
"MODRES")) {
427 modres.res_id = read_res_id(line + 18, line + 12);
434 if (len >= 73 && line[70] ==
' ' && line[71] ==
' ')
436 st.mod_residues.push_back(modres);
438 }
else if (is_record_type(line,
"HETNAM")) {
439 if (len > 71 && line[70] ==
' ') {
441 if (!full_code.empty())
442 st.shortened_ccd_codes.push_back({full_code,
read_string(line + 11, 3)});
445 }
else if (is_record_type(line,
"DBREF")) {
447 Entity& ent = impl::find_or_add(st.entities, chain_name);
449 if (line[5] ==
' ' || line[5] ==
'1')
450 ent.dbrefs.emplace_back();
451 else if (ent.dbrefs.empty())
453 Entity::DbRef& dbref = ent.dbrefs.back();
454 if (line[5] ==
' ' || line[5] ==
'1') {
455 dbref.seq_begin = read_seq_id(line+14);
456 dbref.seq_end = read_seq_id(line+20);
458 if (line[5] ==
' ') {
461 dbref.db_begin.num = read_int(line+55, 5);
462 dbref.db_begin.icode = line[60];
463 dbref.db_end.num = read_int(line+62, 5);
464 dbref.db_end.icode = line[67];
468 }
else if (line[5] ==
'2') {
470 dbref.db_begin.num = read_int(line+45, 10);
471 dbref.db_end.num = read_int(line+57, 10);
473 }
else if (is_record_type(line,
"HEADER")) {
475 st.info[
"_struct_keywords.pdbx_keywords"] =
rtrim_str(std::string(line+10, 40));
477 std::string date = pdb_date_format_to_iso(std::string(line+50, 9));
479 st.info[
"_pdbx_database_status.recvd_initial_deposition_date"] = date;
482 std::string entry_id =
rtrim_str(std::string(line+62, 4));
483 if (!entry_id.empty())
484 st.info[
"_entry.id"] = entry_id;
486 }
else if (is_record_type(line,
"TITLE")) {
488 st.info[
"_struct.title"] +=
rtrim_str(std::string(line+10, len-10-1));
490 }
else if (is_record_type(line,
"KEYWDS")) {
492 st.info[
"_struct_keywords.text"] +=
rtrim_str(std::string(line+10, len-10-1));
494 }
else if (is_record_type(line,
"EXPDTA")) {
496 st.info[
"_exptl.method"] +=
trim_str(std::string(line+10, len-10-1));
498 }
else if (is_record_type(line,
"AUTHOR") && len > 10) {
500 if (!st.meta.authors.empty()) {
501 last = st.meta.authors.back();
502 st.meta.authors.pop_back();
504 size_t prev_size = st.meta.authors.size();
506 const char* end =
rtrim_cstr(start, line+len);
508 if (!last.empty() && st.meta.authors.size() > prev_size) {
510 if (last.back() !=
'-' && last.back() !=
'.')
512 st.meta.authors[prev_size].insert(0, last);
515 }
else if (is_record_type(line,
"CRYST1")) {
517 st.cell.set(read_double(line+6, 9),
518 read_double(line+15, 9),
519 read_double(line+24, 9),
520 read_double(line+33, 7),
521 read_double(line+40, 7),
522 read_double(line+47, 7));
528 st.info[
"_cell.Z_PDB"] = z;
530 }
else if (is_record_type(line,
"MTRIXn")) {
531 if (read_matrix(matrix, line, len) == 3) {
533 if (matrix.is_identity()) {
535 st.info[
"_struct_ncs_oper.id"] = id;
537 bool given = len > 59 && line[59] ==
'1';
538 st.ncs.push_back({id, given, matrix});
539 matrix.set_identity();
542 }
else if (is_record_type(line,
"MODEL")) {
544 wrong(
"MODEL without ENDMDL?");
545 std::string name = std::to_string(read_int(line+10, 4));
546 model = &st.find_or_add_model(name);
547 if (!model->chains.empty())
548 wrong(
"duplicate MODEL number: " + name);
551 }
else if (is_record_type(line,
"ENDMDL")) {
555 }
else if (is_record_type3(line,
"TER")) {
556 if (!chain || st.ter_status ==
'e')
559 if (options.split_chain_on_ter) {
571 for (Residue& res : chain->residues) {
578 } else if (is_record_type(line, "SCALEn")) {
579 if (read_matrix(matrix, line, len) == 3) {
580 st.cell.set_matrices_from_fract(matrix);
581 matrix.set_identity();
584 }
else if (is_record_type(line,
"ORIGX")) {
586 read_matrix(st.origx, line, len);
588 }
else if (is_record_type(line,
"HELIX")) {
593 helix.start.res_id = read_res_id(line+21, line+15);
595 helix.end.res_id = read_res_id(line+33, line+27);
596 helix.set_helix_class_as_int(read_int(line+38, 2));
598 helix.length = read_int(line+72, 5);
599 st.helices.emplace_back(helix);
601 }
else if (is_record_type(line,
"SHEET")) {
605 Sheet& sheet = impl::find_or_add(st.sheets, sheet_id);
606 sheet.strands.emplace_back();
607 Sheet::Strand& strand = sheet.strands.back();
609 strand.start.res_id = read_res_id(line+22, line+17);
611 strand.end.res_id = read_res_id(line+33, line+28);
612 strand.sense = read_int(line+38, 2);
615 strand.hbond_atom2.atom_name =
read_string(line+41, 4);
616 strand.hbond_atom2.chain_name =
read_string(line+48, 2);
617 strand.hbond_atom2.res_id = read_res_id(line+50, line+45);
618 strand.hbond_atom1.atom_name =
read_string(line+56, 4);
619 strand.hbond_atom1.chain_name =
read_string(line+63, 2);
620 strand.hbond_atom1.res_id = read_res_id(line+65, line+60);
623 }
else if (is_record_type(line,
"SSBOND") ||
624 is_record_type(line,
"LINK") ||
625 is_record_type(line,
"CISPEP")) {
626 conn_records.emplace_back(line);
628 }
else if (is_record_type3(line,
"END")) {
630 }
else if (is_record_type(line,
"data")) {
631 if (line[4] ==
'_' && !model)
632 fail(
"Incorrect file format (perhaps it is cif not pdb?): " + source);
633 }
else if (is_record_type(line,
"{\"da")) {
635 fail(
"Incorrect file format (perhaps it is mmJSON not pdb?): " + source);
642 if (st.models.empty())
643 st.models.emplace_back(
"1");
645 if (st.ter_status ==
'e')
652 for (Chain& ch : st.models[0].chains)
653 if (Entity* entity = st.get_entity(ch.name))
654 if (auto polymer = ch.get_polymer())
655 entity->subchains.emplace_back(polymer.subchain_id());
657 st.setup_cell_images();
659 process_conn(st, conn_records);
661 for (std::string& name : st.meta.authors)
662 change_author_name_format_to_mmcif(name);
664 if (!options.skip_remarks)
681 const std::string& name,
687 const std::string& name,
695 if (
input.is_stdin())
697 if (
input.is_compressed())
698 return pdb_impl::read_pdb_from_stream(
input.get_uncompressing_stream(),
#define GEMMI_UNLIKELY(x)
Document read_string(const std::string &data)
const char * rtrim_cstr(const char *start, const char *end=nullptr)
El find_element(const char *symbol)
Structure read_pdb_file(const std::string &path, PdbReadOptions options=PdbReadOptions())
double fast_atof(const char *p, const char **endptr=nullptr)
size_t copy_line_from_stream(char *line, int size, Input &&in)
constexpr int ialpha3_id(const char *s)
std::string rtrim_str(const std::string &str)
void read_metadata_from_remarks(Structure &st)
GEMMI_DLL void assign_subchains(Structure &st, bool force, bool fail_if_unknown=true)
std::string path_basename(const std::string &path, std::initializer_list< const char * > exts)
fileptr_t file_open(const char *path, const char *mode)
void split_str_into_multi(const std::string &str, const char *seps, std::vector< std::string > &result)
GEMMI_DLL void remove_entity_types(Structure &st)
Assigns entity_type=Unknown for all residues.
Structure read_pdb_string(const std::string &str, const std::string &name, PdbReadOptions options=PdbReadOptions())
Structure read_pdb(T &&input, PdbReadOptions options=PdbReadOptions())
bool starts_with(const std::string &str, const std::string &prefix)
void fail(const std::string &msg)
void split_str_into(const std::string &str, S sep, std::vector< std::string > &result)
std::string trim_str(const std::string &str)
void restore_full_ccd_codes(Structure &st)
constexpr int ialpha4_id(const char *s)
Structure read_pdb_from_memory(const char *data, size_t size, const std::string &name, PdbReadOptions options=PdbReadOptions())
const char * skip_blank(const char *p)
options affecting how pdb file is read