Gemmi C++ API
Loading...
Searching...
No Matches
align.hpp
Go to the documentation of this file.
1// Copyright 2020 Global Phasing Ltd.
2//
3// Sequence alignment, label_seq_id assignment, structure superposition.
4
5#ifndef GEMMI_ALIGN_HPP_
6#define GEMMI_ALIGN_HPP_
7
8#include "model.hpp"
9#include "seqalign.hpp" // for align_sequences
10#include "qcp.hpp" // for superpose_positions
11#include "polyheur.hpp" // for are_connected3
12
13namespace gemmi {
14
15// Sequence alignment and label_seq_id assignment
16
17// helper function for sequence alignment
18inline std::vector<int> prepare_target_gapo(const ConstResidueSpan& polymer,
19 PolymerType polymer_type,
20 const AlignmentScoring* scoring=nullptr) {
21 if (!scoring)
23 std::vector<int> gaps;
24 gaps.reserve(polymer.size());
25 gaps.push_back(0); // free gap opening at the beginning of sequence
26 if (is_polypeptide(polymer_type) || is_polynucleotide(polymer_type)) {
27 auto first_conformer = polymer.first_conformer();
28 auto res = first_conformer.begin();
29 for (auto next_res = res; ++next_res != first_conformer.end(); res = next_res) {
30 bool connected = are_connected3(*res, *next_res, polymer_type);
31 gaps.push_back(connected ? scoring->bad_gapo : scoring->good_gapo);
32 }
33 gaps.push_back(0); // free gap after the end of chain
34 }
35 return gaps;
36}
37
39 const std::vector<std::string>& full_seq,
40 const ConstResidueSpan& polymer,
41 PolymerType polymer_type,
42 const AlignmentScoring* scoring=nullptr) {
43 if (!polymer)
44 return AlignmentResult();
45 std::map<std::string, std::uint8_t> encoding;
46 if (!scoring)
48 for (const std::string& res_name : scoring->matrix_encoding)
49 encoding.emplace(res_name, (std::uint8_t)encoding.size());
50 for (const Residue& res : polymer)
51 encoding.emplace(res.name, (std::uint8_t)encoding.size());
52 for (const std::string& mon_list : full_seq)
53 encoding.emplace(Entity::first_mon(mon_list), (std::uint8_t)encoding.size());
54 if (encoding.size() > 255)
55 return AlignmentResult();
56
57 std::vector<std::uint8_t> encoded_full_seq(full_seq.size());
58 for (size_t i = 0; i != full_seq.size(); ++i)
59 encoded_full_seq[i] = encoding.at(Entity::first_mon(full_seq[i]));
60
61 std::vector<std::uint8_t> encoded_model_seq;
62 encoded_model_seq.reserve(polymer.size());
63 for (const Residue& res : polymer.first_conformer())
64 encoded_model_seq.push_back(encoding.at(res.name));
65
66 return align_sequences(encoded_full_seq, encoded_model_seq,
67 prepare_target_gapo(polymer, polymer_type, scoring),
68 (std::uint8_t)encoding.size(), *scoring);
69}
70
71// check for exact match between model sequence and full sequence (SEQRES)
72inline bool seqid_matches_seqres(const ConstResidueSpan& polymer,
73 const Entity& ent) {
74 if (ent.full_sequence.size() != polymer.size())
75 return false;
76 int idx = 0;
77 for (const Residue& res : polymer) {
78 if (ent.full_sequence[idx] != res.name ||
79 ++idx != *res.seqid.num || res.seqid.has_icode())
80 return false;
81 }
82 return true;
83}
84
85inline void clear_sequences(Structure& st) {
86 for (Entity& ent : st.entities) {
87 ent.full_sequence.clear();
88 ent.dbrefs.clear();
89 ent.sifts_unp_acc.clear();
90 }
91}
92
94void assign_best_sequences(Structure& st, const std::vector<std::string>& fasta_sequences);
95
96// Uses sequence alignment (model to SEQRES) to assign label_seq.
97// force: assign label_seq even if full sequence is not known (assumes no gaps)
99 const Entity* ent, bool force) {
100 AlignmentResult result;
101
102 // sequence not known
103 if (!ent || ent->full_sequence.empty()) {
104 if (!force)
105 return;
106 PolymerType ptype = get_or_check_polymer_type(ent, polymer);
107 const Residue* prev = nullptr;
108 for (const Residue& res : polymer.first_conformer()) {
109 if (prev && !are_connected3(*prev, res, ptype))
110 result.push_cigar(1, 1); // assume a single insertion
111 result.push_cigar(0, 1);
112 prev = &res;
113 }
114
115 // exact match - common case that doesn't require alignment
116 } else if (seqid_matches_seqres(polymer, *ent)) {
117 result.push_cigar(0, (int)ent->full_sequence.size());
118
119 // sequence alignment
120 } else {
121 PolymerType ptype = get_or_check_polymer_type(ent, polymer);
122 result = align_sequence_to_polymer(ent->full_sequence, polymer, ptype);
123 }
124
125 auto res_group = polymer.first_conformer().begin();
126 int id = 1;
127 for (AlignmentResult::Item item : result.cigar) {
128 switch (item.op()) {
129 case 'I':
130 id += item.len();
131 break;
132 case 'D': // leaving label_seq as it is
133 for (uint32_t i = 0; i < item.len(); ++i)
134 res_group++;
135 break;
136 case 'M': // not checking for mismatches
137 for (uint32_t i = 0; i < item.len(); ++i, ++id)
138 for (Residue* res = &*res_group++; res != &*res_group; ++res)
139 res->label_seq = id;
140 break;
141 }
142 }
143}
144
146 for (Model& model : st.models)
147 for (Chain& chain : model.chains)
148 for (Residue& res : chain.residues)
149 res.label_seq = Residue::OptionalNum();
150}
151
152inline void assign_label_seq_id(Structure& st, bool force) {
153 for (Model& model : st.models)
154 for (Chain& chain : model.chains)
155 if (ResidueSpan polymer = chain.get_polymer())
156 if (!polymer.front().label_seq || !polymer.back().label_seq) {
157 const Entity* ent = st.get_entity_of(polymer);
158 assign_label_seq_to_polymer(polymer, ent, force);
159 }
160}
161
162
163// superposition
164
165enum class SupSelect {
166 CaP, // only Ca (aminoacids) or P (nucleotides) atoms
167 MainChain, // only main chain atoms
168 All
169};
170
171inline void prepare_positions_for_superposition(std::vector<Position>& pos1,
172 std::vector<Position>& pos2,
173 ConstResidueSpan fixed,
174 ConstResidueSpan movable,
175 PolymerType ptype,
176 SupSelect sel,
177 char altloc='\0',
178 std::vector<int>* ca_offsets=nullptr) {
180 movable, ptype,
182 auto it1 = fixed.first_conformer().begin();
183 auto it2 = movable.first_conformer().begin();
184 std::vector<AtomNameElement> used_atoms;
185 bool is_na = is_polynucleotide(ptype);
186 const AtomNameElement* ca_p = nullptr;
187 if (sel == SupSelect::CaP) {
188 used_atoms.push_back({is_na ? "P" : "CA", is_na ? El::P : El::C});
189 } else if (sel == SupSelect::MainChain) {
190 used_atoms = get_mainchain_atoms(ptype);
191 ca_p = &used_atoms[is_na ? 0 : 1];
192 }
193 for (AlignmentResult::Item item : result.cigar) {
194 char op = item.op();
195 for (uint32_t i = 0; i < item.len(); ++i) {
196 int ca_offset = -1;
197 if (op == 'M' && it1->name == it2->name) {
198 if (!used_atoms.empty()) {
199 for (const AtomNameElement& ane : used_atoms) {
200 const Atom* a1 = it1->find_atom(ane.atom_name, altloc, ane.el);
201 const Atom* a2 = it2->find_atom(ane.atom_name, altloc, ane.el);
202 if (a1 && a2) {
203 if (&ane == ca_p)
204 ca_offset = (int)pos1.size();
205 pos1.push_back(a1->pos);
206 pos2.push_back(a2->pos);
207 }
208 }
209 } else {
210 for (const Atom& a1 : it1->atoms)
211 if (a1.altloc_matches(altloc))
212 if (const Atom* a2 = it2->find_atom(a1.name, altloc, a1.element)) {
213 pos1.push_back(a1.pos);
214 pos2.push_back(a2->pos);
215 }
216 }
217 }
218 if (op == 'M' || op == 'I') {
219 ++it1;
220 if (ca_offsets)
221 ca_offsets->push_back(ca_offset);
222 }
223 if (op == 'M' || op == 'D')
224 ++it2;
225 }
226 }
227}
228
230 ConstResidueSpan movable,
231 PolymerType ptype,
232 SupSelect sel,
233 char altloc='\0') {
234 std::vector<Position> pos1, pos2;
235 prepare_positions_for_superposition(pos1, pos2, fixed, movable, ptype, sel, altloc);
236 SupResult r;
237 r.count = pos1.size();
238 double sd = 0;
239 for (size_t i = 0; i != pos1.size(); ++i)
240 sd += pos1[i].dist_sq(pos2[i]);
241 r.rmsd = std::sqrt(sd / r.count);
242 return r;
243}
244
246 ConstResidueSpan movable,
247 PolymerType ptype,
248 SupSelect sel,
249 int trim_cycles=0,
250 double trim_cutoff=2.0,
251 char altloc='\0') {
252 std::vector<Position> pos1, pos2;
253 prepare_positions_for_superposition(pos1, pos2, fixed, movable, ptype, sel, altloc);
254 const double* weights = nullptr;
255 size_t len = pos1.size();
256 SupResult sr = superpose_positions(pos1.data(), pos2.data(), len, weights);
257
258 for (int n = 0; n < trim_cycles; ++n) {
259 double max_dist_sq = sq(trim_cutoff * sr.rmsd);
260 size_t p = 0;
261 for (size_t i = 0; i != len; ++i) {
262 Vec3 m2 = sr.transform.apply(pos2[i]);
263 if (m2.dist_sq(pos1[i]) <= max_dist_sq) {
264 if (i != p) {
265 pos1[p] = pos1[i];
266 pos2[p] = pos2[i];
267 }
268 ++p;
269 }
270 }
271 if (p == len)
272 break;
273 len = p;
274 if (len < 3)
275 fail("in calculate_superposition(): only ", std::to_string(len),
276 " atoms after trimming");
277 sr = superpose_positions(pos1.data(), pos2.data(), len, weights);
278 }
279
280 return sr;
281}
282
283// Returns superpositions for all residues in fixed.first_conformer(),
284// performed by superposing backbone in radius=10.0 from residue's Ca.
285inline std::vector<SupResult> calculate_superpositions_in_moving_window(
286 ConstResidueSpan fixed,
287 ConstResidueSpan movable,
288 PolymerType ptype,
289 double radius=10.0) {
290 const double radius_sq = radius * radius;
291 std::vector<Position> pos1, pos2;
292 char altloc = '\0';
294 std::vector<int> ca_offsets;
295 prepare_positions_for_superposition(pos1, pos2, fixed, movable, ptype,
296 sel, altloc, &ca_offsets);
297 const double* weights = nullptr;
298 std::vector<SupResult> result;
299 for (int offset : ca_offsets) {
300 if (offset == -1) {
301 result.push_back(SupResult{NAN, 0, {}, {}, {}});
302 continue;
303 }
304 const Position& ca_pos = pos1[offset];
305 int a = offset;
306 while (a > 0 && ca_pos.dist_sq(pos1[a-1]) < radius_sq)
307 --a;
308 int b = offset;
309 while (b+1 < (int)pos1.size() && ca_pos.dist_sq(pos1[b+1]) < radius_sq)
310 ++b;
311 result.push_back(superpose_positions(&pos1[a], &pos2[a], b-a+1, weights));
312 }
313 return result;
314}
315
316} // namespace gemmi
317#endif
#define GEMMI_DLL
Definition fail.hpp:53
Data structures to store macromolecular structure models.
void clear_sequences(Structure &st)
Definition align.hpp:85
void assign_label_seq_id(Structure &st, bool force)
Definition align.hpp:152
void assign_label_seq_to_polymer(ResidueSpan &polymer, const Entity *ent, bool force)
Definition align.hpp:98
void prepare_positions_for_superposition(std::vector< Position > &pos1, std::vector< Position > &pos2, ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, SupSelect sel, char altloc='\0', std::vector< int > *ca_offsets=nullptr)
Definition align.hpp:171
std::vector< AtomNameElement > get_mainchain_atoms(PolymerType ptype)
Definition polyheur.hpp:30
SupResult superpose_positions(const Position *pos1, const Position *pos2, size_t len, const double *weight)
Definition qcp.hpp:297
bool is_polypeptide(PolymerType pt)
Definition metadata.hpp:231
std::vector< SupResult > calculate_superpositions_in_moving_window(ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, double radius=10.0)
Definition align.hpp:285
PolymerType get_or_check_polymer_type(const Entity *ent, const ConstResidueSpan &polymer)
Definition polyheur.hpp:21
GEMMI_DLL void assign_best_sequences(Structure &st, const std::vector< std::string > &fasta_sequences)
void clear_label_seq_id(Structure &st)
Definition align.hpp:145
AlignmentResult align_sequence_to_polymer(const std::vector< std::string > &full_seq, const ConstResidueSpan &polymer, PolymerType polymer_type, const AlignmentScoring *scoring=nullptr)
Definition align.hpp:38
SupResult calculate_current_rmsd(ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, SupSelect sel, char altloc='\0')
Definition align.hpp:229
AlignmentResult align_sequences(const std::vector< std::uint8_t > &query, const std::vector< std::uint8_t > &target, const std::vector< int > &target_gapo, std::uint8_t m, const AlignmentScoring &scoring)
All values in query and target must be less then m.
Definition seqalign.hpp:195
bool is_polynucleotide(PolymerType pt)
Definition metadata.hpp:235
constexpr float sq(float x)
Definition math.hpp:34
bool seqid_matches_seqres(const ConstResidueSpan &polymer, const Entity &ent)
Definition align.hpp:72
Vec3_< double > Vec3
Definition math.hpp:113
SupResult calculate_superposition(ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, SupSelect sel, int trim_cycles=0, double trim_cutoff=2.0, char altloc='\0')
Definition align.hpp:245
bool are_connected3(const Residue &r1, const Residue &r2, PolymerType ptype)
are_connected3() = are_connected() + fallback to are_connected2()
Definition polyheur.hpp:86
SupSelect
Definition align.hpp:165
void fail(const std::string &msg)
Definition fail.hpp:59
std::vector< int > prepare_target_gapo(const ConstResidueSpan &polymer, PolymerType polymer_type, const AlignmentScoring *scoring=nullptr)
Definition align.hpp:18
Heuristic methods for working with chains and polymers. Also includes a few well-defined functions,...
Structural superposition, the QCP method.
Simple pairwise sequence alignment.
void push_cigar(std::uint32_t op, int len)
Definition seqalign.hpp:184
std::vector< Item > cigar
Definition seqalign.hpp:80
static const AlignmentScoring * partial_model()
Definition seqalign.hpp:36
static const AlignmentScoring * blosum62()
Definition seqalign.hpp:40
Represents atom site in macromolecular structure (~100 bytes).
Definition model.hpp:120
Position pos
Definition model.hpp:131
std::vector< std::string > extract_sequence() const
Definition model.hpp:340
ConstUniqProxy< Residue, ConstResidueSpan > first_conformer() const
Definition model.hpp:325
std::vector< std::string > full_sequence
SEQRES or entity_poly_seq with microheterogeneity as comma-separated names.
Definition metadata.hpp:260
static std::string first_mon(const std::string &mon_list)
Definition metadata.hpp:264
Coordinates in Angstroms - orthogonal (Cartesian) coordinates.
Definition unitcell.hpp:32
UniqProxy< Residue, ResidueSpan > first_conformer()
Definition model.hpp:403
std::size_t size() const
Definition span.hpp:66
Entity * get_entity_of(const ConstResidueSpan &sub)
Definition model.hpp:960
std::vector< Model > models
Definition model.hpp:889
std::vector< Entity > entities
Definition model.hpp:891
Transform transform
Definition qcp.hpp:90
double rmsd
Definition qcp.hpp:87
size_t count
Definition qcp.hpp:88
Vec3 apply(const Vec3 &x) const
Definition math.hpp:402