Gemmi C++ API
Loading...
Searching...
No Matches
align.hpp
Go to the documentation of this file.
1// Copyright 2020 Global Phasing Ltd.
2//
3// Sequence alignment, label_seq_id assignment, structure superposition.
4
5#ifndef GEMMI_ALIGN_HPP_
6#define GEMMI_ALIGN_HPP_
7
8#include "model.hpp"
9#include "seqalign.hpp" // for align_sequences
10#include "qcp.hpp" // for superpose_positions
11#include "polyheur.hpp" // for are_connected3
12
13namespace gemmi {
14
15// Sequence alignment and label_seq_id assignment
16
17// helper function for sequence alignment
18inline std::vector<int> prepare_target_gapo(const ConstResidueSpan& polymer,
19 PolymerType polymer_type,
20 const AlignmentScoring* scoring=nullptr) {
21 if (!scoring)
23 std::vector<int> gaps;
24 gaps.reserve(polymer.size());
25 gaps.push_back(0); // free gap opening at the beginning of sequence
26 if (is_polypeptide(polymer_type) || is_polynucleotide(polymer_type)) {
27 auto first_conformer = polymer.first_conformer();
28 auto res = first_conformer.begin();
29 for (auto next_res = res; ++next_res != first_conformer.end(); res = next_res) {
30 bool connected = are_connected3(*res, *next_res, polymer_type);
31 gaps.push_back(connected ? scoring->bad_gapo : scoring->good_gapo);
32 }
33 gaps.push_back(0); // free gap after the end of chain
34 }
35 return gaps;
36}
37
38// pre: !!polymer
40 const std::vector<std::string>& full_seq,
41 const ConstResidueSpan& polymer,
42 PolymerType polymer_type,
43 const AlignmentScoring* scoring=nullptr) {
44 std::map<std::string, std::uint8_t> encoding;
45 if (!scoring)
47 for (const std::string& res_name : scoring->matrix_encoding)
48 encoding.emplace(res_name, (std::uint8_t)encoding.size());
49 for (const Residue& res : polymer)
50 encoding.emplace(res.name, (std::uint8_t)encoding.size());
51 for (const std::string& mon_list : full_seq)
52 encoding.emplace(Entity::first_mon(mon_list), (std::uint8_t)encoding.size());
53 if (encoding.size() > 255)
54 return AlignmentResult();
55
56 std::vector<std::uint8_t> encoded_full_seq(full_seq.size());
57 for (size_t i = 0; i != full_seq.size(); ++i)
58 encoded_full_seq[i] = encoding.at(Entity::first_mon(full_seq[i]));
59
60 std::vector<std::uint8_t> encoded_model_seq;
61 encoded_model_seq.reserve(polymer.size());
62 for (const Residue& res : polymer.first_conformer())
63 encoded_model_seq.push_back(encoding.at(res.name));
64
65 return align_sequences(encoded_full_seq, encoded_model_seq,
66 prepare_target_gapo(polymer, polymer_type, scoring),
67 (std::uint8_t)encoding.size(), *scoring);
68}
69
70// check for exact match between model sequence and full sequence (SEQRES)
71inline bool seqid_matches_seqres(const ConstResidueSpan& polymer,
72 const Entity& ent) {
73 if (ent.full_sequence.size() != polymer.size())
74 return false;
75 int idx = 0;
76 for (const Residue& res : polymer) {
77 if (ent.full_sequence[idx] != res.name ||
78 ++idx != *res.seqid.num || res.seqid.has_icode())
79 return false;
80 }
81 return true;
82}
83
84inline void clear_sequences(Structure& st) {
85 for (Entity& ent : st.entities) {
86 ent.full_sequence.clear();
87 ent.dbrefs.clear();
88 ent.sifts_unp_acc.clear();
89 }
90}
91
93void assign_best_sequences(Structure& st, const std::vector<std::string>& fasta_sequences);
94
95// Uses sequence alignment (model to SEQRES) to assign label_seq.
96// force: assign label_seq even if full sequence is not known (assumes no gaps)
98 const Entity* ent, bool force) {
99 AlignmentResult result;
100
101 // sequence not known
102 if (!ent || ent->full_sequence.empty()) {
103 if (!force)
104 return;
105 PolymerType ptype = get_or_check_polymer_type(ent, polymer);
106 const Residue* prev = nullptr;
107 for (const Residue& res : polymer.first_conformer()) {
108 if (prev && !are_connected3(*prev, res, ptype))
109 result.push_cigar(1, 1); // assume a single insertion
110 result.push_cigar(0, 1);
111 prev = &res;
112 }
113
114 // exact match - common case that doesn't require alignment
115 } else if (seqid_matches_seqres(polymer, *ent)) {
116 result.push_cigar(0, (int)ent->full_sequence.size());
117
118 // sequence alignment
119 } else {
120 PolymerType ptype = get_or_check_polymer_type(ent, polymer);
121 result = align_sequence_to_polymer(ent->full_sequence, polymer, ptype);
122 }
123
124 auto res_group = polymer.first_conformer().begin();
125 int id = 1;
126 for (AlignmentResult::Item item : result.cigar) {
127 switch (item.op()) {
128 case 'I':
129 id += item.len();
130 break;
131 case 'D': // leaving label_seq as it is
132 for (uint32_t i = 0; i < item.len(); ++i)
133 res_group++;
134 break;
135 case 'M': // not checking for mismatches
136 for (uint32_t i = 0; i < item.len(); ++i, ++id)
137 for (Residue* res = &*res_group++; res != &*res_group; ++res)
138 res->label_seq = id;
139 break;
140 }
141 }
142}
143
145 for (Model& model : st.models)
146 for (Chain& chain : model.chains)
147 for (Residue& res : chain.residues)
148 res.label_seq = Residue::OptionalNum();
149}
150
151inline void assign_label_seq_id(Structure& st, bool force) {
152 for (Model& model : st.models)
153 for (Chain& chain : model.chains)
154 if (ResidueSpan polymer = chain.get_polymer())
155 if (!polymer.front().label_seq || !polymer.back().label_seq) {
156 const Entity* ent = st.get_entity_of(polymer);
157 assign_label_seq_to_polymer(polymer, ent, force);
158 }
159}
160
161
162// superposition
163
164enum class SupSelect {
165 CaP, // only Ca (aminoacids) or P (nucleotides) atoms
166 MainChain, // only main chain atoms
167 All
168};
169
170inline void prepare_positions_for_superposition(std::vector<Position>& pos1,
171 std::vector<Position>& pos2,
172 ConstResidueSpan fixed,
173 ConstResidueSpan movable,
174 PolymerType ptype,
175 SupSelect sel,
176 char altloc='\0',
177 std::vector<int>* ca_offsets=nullptr) {
179 movable, ptype,
181 auto it1 = fixed.first_conformer().begin();
182 auto it2 = movable.first_conformer().begin();
183 std::vector<AtomNameElement> used_atoms;
184 bool is_na = is_polynucleotide(ptype);
185 const AtomNameElement* ca_p = nullptr;
186 if (sel == SupSelect::CaP) {
187 used_atoms.push_back({is_na ? "P" : "CA", is_na ? El::P : El::C});
188 } else if (sel == SupSelect::MainChain) {
189 used_atoms = get_mainchain_atoms(ptype);
190 ca_p = &used_atoms[is_na ? 0 : 1];
191 }
192 for (AlignmentResult::Item item : result.cigar) {
193 char op = item.op();
194 for (uint32_t i = 0; i < item.len(); ++i) {
195 int ca_offset = -1;
196 if (op == 'M' && it1->name == it2->name) {
197 if (!used_atoms.empty()) {
198 for (const AtomNameElement& ane : used_atoms) {
199 const Atom* a1 = it1->find_atom(ane.atom_name, altloc, ane.el);
200 const Atom* a2 = it2->find_atom(ane.atom_name, altloc, ane.el);
201 if (a1 && a2) {
202 if (&ane == ca_p)
203 ca_offset = (int)pos1.size();
204 pos1.push_back(a1->pos);
205 pos2.push_back(a2->pos);
206 }
207 }
208 } else {
209 for (const Atom& a1 : it1->atoms)
210 if (a1.altloc_matches(altloc))
211 if (const Atom* a2 = it2->find_atom(a1.name, altloc, a1.element)) {
212 pos1.push_back(a1.pos);
213 pos2.push_back(a2->pos);
214 }
215 }
216 }
217 if (op == 'M' || op == 'I') {
218 ++it1;
219 if (ca_offsets)
220 ca_offsets->push_back(ca_offset);
221 }
222 if (op == 'M' || op == 'D')
223 ++it2;
224 }
225 }
226}
227
229 ConstResidueSpan movable,
230 PolymerType ptype,
231 SupSelect sel,
232 char altloc='\0') {
233 std::vector<Position> pos1, pos2;
234 prepare_positions_for_superposition(pos1, pos2, fixed, movable, ptype, sel, altloc);
235 SupResult r;
236 r.count = pos1.size();
237 double sd = 0;
238 for (size_t i = 0; i != pos1.size(); ++i)
239 sd += pos1[i].dist_sq(pos2[i]);
240 r.rmsd = std::sqrt(sd / r.count);
241 return r;
242}
243
245 ConstResidueSpan movable,
246 PolymerType ptype,
247 SupSelect sel,
248 int trim_cycles=0,
249 double trim_cutoff=2.0,
250 char altloc='\0') {
251 std::vector<Position> pos1, pos2;
252 prepare_positions_for_superposition(pos1, pos2, fixed, movable, ptype, sel, altloc);
253 const double* weights = nullptr;
254 size_t len = pos1.size();
255 SupResult sr = superpose_positions(pos1.data(), pos2.data(), len, weights);
256
257 for (int n = 0; n < trim_cycles; ++n) {
258 double max_dist_sq = sq(trim_cutoff * sr.rmsd);
259 size_t p = 0;
260 for (size_t i = 0; i != len; ++i) {
261 Vec3 m2 = sr.transform.apply(pos2[i]);
262 if (m2.dist_sq(pos1[i]) <= max_dist_sq) {
263 if (i != p) {
264 pos1[p] = pos1[i];
265 pos2[p] = pos2[i];
266 }
267 ++p;
268 }
269 }
270 if (p == len)
271 break;
272 len = p;
273 if (len < 3)
274 fail("in calculate_superposition(): only ", std::to_string(len),
275 " atoms after trimming");
276 sr = superpose_positions(pos1.data(), pos2.data(), len, weights);
277 }
278
279 return sr;
280}
281
282// Returns superpositions for all residues in fixed.first_conformer(),
283// performed by superposing backbone in radius=10.0 from residue's Ca.
284inline std::vector<SupResult> calculate_superpositions_in_moving_window(
285 ConstResidueSpan fixed,
286 ConstResidueSpan movable,
287 PolymerType ptype,
288 double radius=10.0) {
289 const double radius_sq = radius * radius;
290 std::vector<Position> pos1, pos2;
291 char altloc = '\0';
293 std::vector<int> ca_offsets;
294 prepare_positions_for_superposition(pos1, pos2, fixed, movable, ptype,
295 sel, altloc, &ca_offsets);
296 const double* weights = nullptr;
297 std::vector<SupResult> result;
298 for (int offset : ca_offsets) {
299 if (offset == -1) {
300 result.push_back(SupResult{NAN, 0, {}, {}, {}});
301 continue;
302 }
303 const Position& ca_pos = pos1[offset];
304 int a = offset;
305 while (a > 0 && ca_pos.dist_sq(pos1[a-1]) < radius_sq)
306 --a;
307 int b = offset;
308 while (b+1 < (int)pos1.size() && ca_pos.dist_sq(pos1[b+1]) < radius_sq)
309 ++b;
310 result.push_back(superpose_positions(&pos1[a], &pos2[a], b-a+1, weights));
311 }
312 return result;
313}
314
315} // namespace gemmi
316#endif
#define GEMMI_DLL
Definition fail.hpp:53
void clear_sequences(Structure &st)
Definition align.hpp:84
void assign_label_seq_id(Structure &st, bool force)
Definition align.hpp:151
void assign_label_seq_to_polymer(ResidueSpan &polymer, const Entity *ent, bool force)
Definition align.hpp:97
void prepare_positions_for_superposition(std::vector< Position > &pos1, std::vector< Position > &pos2, ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, SupSelect sel, char altloc='\0', std::vector< int > *ca_offsets=nullptr)
Definition align.hpp:170
std::vector< AtomNameElement > get_mainchain_atoms(PolymerType ptype)
Definition polyheur.hpp:31
SupResult superpose_positions(const Position *pos1, const Position *pos2, size_t len, const double *weight)
Definition qcp.hpp:297
bool is_polypeptide(PolymerType pt)
Definition metadata.hpp:211
std::vector< SupResult > calculate_superpositions_in_moving_window(ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, double radius=10.0)
Definition align.hpp:284
PolymerType get_or_check_polymer_type(const Entity *ent, const ConstResidueSpan &polymer)
Definition polyheur.hpp:22
GEMMI_DLL void assign_best_sequences(Structure &st, const std::vector< std::string > &fasta_sequences)
void clear_label_seq_id(Structure &st)
Definition align.hpp:144
AlignmentResult align_sequence_to_polymer(const std::vector< std::string > &full_seq, const ConstResidueSpan &polymer, PolymerType polymer_type, const AlignmentScoring *scoring=nullptr)
Definition align.hpp:39
SupResult calculate_current_rmsd(ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, SupSelect sel, char altloc='\0')
Definition align.hpp:228
AlignmentResult align_sequences(const std::vector< std::uint8_t > &query, const std::vector< std::uint8_t > &target, const std::vector< int > &target_gapo, std::uint8_t m, const AlignmentScoring &scoring)
All values in query and target must be less then m.
Definition seqalign.hpp:194
bool is_polynucleotide(PolymerType pt)
Definition metadata.hpp:215
constexpr float sq(float x)
Definition math.hpp:34
bool seqid_matches_seqres(const ConstResidueSpan &polymer, const Entity &ent)
Definition align.hpp:71
Vec3_< double > Vec3
Definition math.hpp:101
SupResult calculate_superposition(ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, SupSelect sel, int trim_cycles=0, double trim_cutoff=2.0, char altloc='\0')
Definition align.hpp:244
bool are_connected3(const Residue &r1, const Residue &r2, PolymerType ptype)
are_connected3() = are_connected() + fallback to are_connected2()
Definition polyheur.hpp:87
SupSelect
Definition align.hpp:164
void fail(const std::string &msg)
Definition fail.hpp:59
std::vector< int > prepare_target_gapo(const ConstResidueSpan &polymer, PolymerType polymer_type, const AlignmentScoring *scoring=nullptr)
Definition align.hpp:18
void push_cigar(std::uint32_t op, int len)
Definition seqalign.hpp:183
std::vector< Item > cigar
Definition seqalign.hpp:79
static const AlignmentScoring * partial_model()
Definition seqalign.hpp:35
static const AlignmentScoring * blosum62()
Definition seqalign.hpp:39
Represents atom site in macromolecular structure (~100 bytes).
Definition model.hpp:112
Position pos
Definition model.hpp:123
std::vector< std::string > extract_sequence() const
Definition model.hpp:319
ConstUniqProxy< Residue, ConstResidueSpan > first_conformer() const
Definition model.hpp:304
std::vector< std::string > full_sequence
SEQRES or entity_poly_seq with microheterogeneity as comma-separated names.
Definition metadata.hpp:238
static std::string first_mon(const std::string &mon_list)
Definition metadata.hpp:241
Coordinates in Angstroms - orthogonal (Cartesian) coordinates.
Definition unitcell.hpp:32
UniqProxy< Residue, ResidueSpan > first_conformer()
Definition model.hpp:382
std::size_t size() const
Definition span.hpp:66
Entity * get_entity_of(const ConstResidueSpan &sub)
Definition model.hpp:1000
std::vector< Model > models
Definition model.hpp:925
std::vector< Entity > entities
Definition model.hpp:927
Transform transform
Definition qcp.hpp:90
double rmsd
Definition qcp.hpp:87
size_t count
Definition qcp.hpp:88
Vec3 apply(const Vec3 &x) const
Definition math.hpp:380