5#ifndef GEMMI_ALIGN_HPP_
6#define GEMMI_ALIGN_HPP_
23 std::vector<int> gaps;
24 gaps.reserve(polymer.
size());
28 auto res = first_conformer.begin();
29 for (
auto next_res = res; ++next_res != first_conformer.end(); res = next_res) {
31 gaps.push_back(connected ? scoring->bad_gapo : scoring->good_gapo);
40 const std::vector<std::string>& full_seq,
44 std::map<std::string, std::uint8_t> encoding;
47 for (
const std::string& res_name : scoring->matrix_encoding)
48 encoding.emplace(res_name, (std::uint8_t)encoding.size());
49 for (
const Residue& res : polymer)
50 encoding.emplace(res.name, (std::uint8_t)encoding.size());
51 for (
const std::string& mon_list : full_seq)
53 if (encoding.size() > 255)
56 std::vector<std::uint8_t> encoded_full_seq(full_seq.size());
57 for (
size_t i = 0; i != full_seq.size(); ++i)
60 std::vector<std::uint8_t> encoded_model_seq;
61 encoded_model_seq.reserve(polymer.
size());
63 encoded_model_seq.push_back(encoding.at(res.name));
67 (std::uint8_t)encoding.size(), *scoring);
76 for (
const Residue& res : polymer) {
78 ++idx != *res.seqid.num || res.seqid.has_icode())
86 ent.full_sequence.clear();
88 ent.sifts_unp_acc.clear();
98 const Entity* ent,
bool force) {
132 for (uint32_t i = 0; i < item.len(); ++i)
136 for (uint32_t i = 0; i < item.len(); ++i, ++id)
137 for (
Residue* res = &*res_group++; res != &*res_group; ++res)
146 for (
Chain& chain : model.chains)
147 for (
Residue& res : chain.residues)
153 for (
Chain& chain : model.chains)
155 if (!polymer.front().label_seq || !polymer.back().label_seq) {
171 std::vector<Position>& pos2,
177 std::vector<int>* ca_offsets=
nullptr) {
183 std::vector<AtomNameElement> used_atoms;
187 used_atoms.push_back({is_na ?
"P" :
"CA", is_na ?
El::P :
El::C});
190 ca_p = &used_atoms[is_na ? 0 : 1];
194 for (uint32_t i = 0; i < item.len(); ++i) {
196 if (op ==
'M' && it1->name == it2->name) {
197 if (!used_atoms.empty()) {
199 const Atom* a1 = it1->find_atom(ane.atom_name, altloc, ane.el);
200 const Atom* a2 = it2->find_atom(ane.atom_name, altloc, ane.el);
203 ca_offset = (int)pos1.size();
204 pos1.push_back(a1->
pos);
205 pos2.push_back(a2->
pos);
209 for (
const Atom& a1 : it1->atoms)
210 if (a1.altloc_matches(altloc))
211 if (
const Atom* a2 = it2->find_atom(a1.name, altloc, a1.element)) {
212 pos1.push_back(a1.pos);
213 pos2.push_back(a2->pos);
217 if (op ==
'M' || op ==
'I') {
220 ca_offsets->push_back(ca_offset);
222 if (op ==
'M' || op ==
'D')
233 std::vector<Position> pos1, pos2;
236 r.
count = pos1.size();
238 for (
size_t i = 0; i != pos1.size(); ++i)
239 sd += pos1[i].dist_sq(pos2[i]);
249 double trim_cutoff=2.0,
251 std::vector<Position> pos1, pos2;
253 const double* weights =
nullptr;
254 size_t len = pos1.size();
257 for (
int n = 0; n < trim_cycles; ++n) {
258 double max_dist_sq =
sq(trim_cutoff * sr.
rmsd);
260 for (
size_t i = 0; i != len; ++i) {
262 if (m2.dist_sq(pos1[i]) <= max_dist_sq) {
274 fail(
"in calculate_superposition(): only ", std::to_string(len),
275 " atoms after trimming");
288 double radius=10.0) {
289 const double radius_sq = radius * radius;
290 std::vector<Position> pos1, pos2;
293 std::vector<int> ca_offsets;
295 sel, altloc, &ca_offsets);
296 const double* weights =
nullptr;
297 std::vector<SupResult> result;
298 for (
int offset : ca_offsets) {
300 result.push_back(
SupResult{NAN, 0, {}, {}, {}});
303 const Position& ca_pos = pos1[offset];
305 while (a > 0 && ca_pos.dist_sq(pos1[a-1]) < radius_sq)
308 while (b+1 < (
int)pos1.size() && ca_pos.dist_sq(pos1[b+1]) < radius_sq)
void clear_sequences(Structure &st)
void assign_label_seq_id(Structure &st, bool force)
void assign_label_seq_to_polymer(ResidueSpan &polymer, const Entity *ent, bool force)
void prepare_positions_for_superposition(std::vector< Position > &pos1, std::vector< Position > &pos2, ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, SupSelect sel, char altloc='\0', std::vector< int > *ca_offsets=nullptr)
std::vector< AtomNameElement > get_mainchain_atoms(PolymerType ptype)
SupResult superpose_positions(const Position *pos1, const Position *pos2, size_t len, const double *weight)
bool is_polypeptide(PolymerType pt)
std::vector< SupResult > calculate_superpositions_in_moving_window(ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, double radius=10.0)
PolymerType get_or_check_polymer_type(const Entity *ent, const ConstResidueSpan &polymer)
GEMMI_DLL void assign_best_sequences(Structure &st, const std::vector< std::string > &fasta_sequences)
void clear_label_seq_id(Structure &st)
AlignmentResult align_sequence_to_polymer(const std::vector< std::string > &full_seq, const ConstResidueSpan &polymer, PolymerType polymer_type, const AlignmentScoring *scoring=nullptr)
SupResult calculate_current_rmsd(ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, SupSelect sel, char altloc='\0')
AlignmentResult align_sequences(const std::vector< std::uint8_t > &query, const std::vector< std::uint8_t > &target, const std::vector< int > &target_gapo, std::uint8_t m, const AlignmentScoring &scoring)
All values in query and target must be less then m.
bool is_polynucleotide(PolymerType pt)
constexpr float sq(float x)
bool seqid_matches_seqres(const ConstResidueSpan &polymer, const Entity &ent)
SupResult calculate_superposition(ConstResidueSpan fixed, ConstResidueSpan movable, PolymerType ptype, SupSelect sel, int trim_cycles=0, double trim_cutoff=2.0, char altloc='\0')
bool are_connected3(const Residue &r1, const Residue &r2, PolymerType ptype)
are_connected3() = are_connected() + fallback to are_connected2()
void fail(const std::string &msg)
std::vector< int > prepare_target_gapo(const ConstResidueSpan &polymer, PolymerType polymer_type, const AlignmentScoring *scoring=nullptr)
void push_cigar(std::uint32_t op, int len)
std::vector< Item > cigar
static const AlignmentScoring * partial_model()
static const AlignmentScoring * blosum62()
Represents atom site in macromolecular structure (~100 bytes).
std::vector< std::string > extract_sequence() const
ConstUniqProxy< Residue, ConstResidueSpan > first_conformer() const
std::vector< std::string > full_sequence
SEQRES or entity_poly_seq with microheterogeneity as comma-separated names.
static std::string first_mon(const std::string &mon_list)
Coordinates in Angstroms - orthogonal (Cartesian) coordinates.
UniqProxy< Residue, ResidueSpan > first_conformer()
Entity * get_entity_of(const ConstResidueSpan &sub)
std::vector< Model > models
std::vector< Entity > entities