Gemmi C++ API
Loading...
Searching...
No Matches
seqalign.hpp
Go to the documentation of this file.
1// Simple pairwise sequence alignment.
2//
3// Code in this file is based on and derived from files ksw2_gg.c and ksw2.h
4// from https://github.com/lh3/ksw2, which is under the MIT license.
5// The original code, written by Heng Li, has more features and has more
6// efficient variants that use SSE instructions.
7
8#ifndef GEMMI_SEQALIGN_HPP_
9#define GEMMI_SEQALIGN_HPP_
10
11#include <cstdint>
12#include <algorithm> // for reverse
13#include <map>
14#include <string>
15#include <vector>
16#include "fail.hpp" // for fail
17
18namespace gemmi {
19
21 int match = 1;
22 int mismatch = -1;
23 int gapo = -1; // gap opening penalty
24 int gape = -1; // gap extension penalty
25 // In a polymer in model, coordinates are used to determine expected gaps.
26 int good_gapo = 0; // gap opening in expected place in a polymer
27 int bad_gapo = -2; // gap opening that was not predicted
28 std::vector<std::int8_t> score_matrix;
29 std::vector<std::string> matrix_encoding;
30
31 static const AlignmentScoring* simple() {
32 static const AlignmentScoring s{};
33 return &s;
34 }
35 // Scoring for alignment of partially-modelled polymer to its full sequence
37 static const AlignmentScoring s = { 100, -10000, -10000, -1, 0, -200, {}, {} };
38 return &s;
39 }
40 static const AlignmentScoring* blosum62() {
41 // BLAST uses BLOSUM-62 with gap cost (10,1)
42 static const AlignmentScoring s = {
43 1, -4, -10, -1, 0, -10,
44 { 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,
45 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,
46 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,
47 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,
48 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,
49 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,
50 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,
51 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,
52 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,
53 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,
54 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,
55 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,
56 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,
57 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,
58 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,
59 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,
60 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,
61 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,
62 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,
63 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4},
64 {"ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
65 "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL"}
66 };
67 return &s;
68 }
69};
70
72 struct Item {
73 std::uint32_t value;
74 char op() const { return "MID"[value & 0xf]; }
75 std::uint32_t len() const { return value >> 4; }
76 };
77 int score = 0;
78 int match_count = 0;
79 std::string match_string;
80 std::vector<Item> cigar;
81
82 std::string cigar_str() const {
83 std::string s;
84 for (Item item : cigar) {
85 s += std::to_string(item.len());
86 s += item.op();
87 }
88 return s;
89 }
90
91 // 1=query, 2=target, other=shorter
92 std::size_t input_length(int which) const {
93 std::size_t counters[3] = {0, 0, 0};
94 for (Item item : cigar)
95 counters[item.value & 0xf] += item.len();
96 if (which == 1 || which == 2)
97 return counters[0] + counters[which];
98 return counters[0] + std::min(counters[1], counters[2]);
99 }
100 double calculate_identity(int which=0) const {
101 return 100. * match_count / input_length(which);
102 }
103
104 // In the backtrack matrix, value p[] has the following structure:
105 // bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F
106 // bit 3/0x08: 1 if a continuation on the E state
107 // bit 4/0x10: 1 if a continuation on the F state
108 void backtrack_to_cigar(const std::uint8_t *p, int i, int j) {
109 i--;
110 int j0 = j--;
111 int state = 0;
112 while (i >= 0 && j >= 0) {
113 // at the beginning of the loop, _state_ tells us which state to check
114 // if requesting the H state, find state one maximizes it.
115 uint32_t tmp = p[(std::size_t)i * j0 + j];
116 if (state == 0 || (tmp & (1 << (state + 2))) == 0)
117 state = tmp & 7;
118 if (state == 0) { // match
119 push_cigar(0, 1);
120 --i;
121 --j;
122 } else if (state == 1) { // deletion
123 push_cigar(2, 1);
124 --i;
125 } else { // insertion
126 push_cigar(1, 1);
127 --j;
128 }
129 }
130 if (i >= 0)
131 push_cigar(2, i + 1); // first deletion
132 else if (j >= 0)
133 push_cigar(1, j + 1); // first insertion
134 std::reverse(cigar.begin(), cigar.end());
135 }
136
137
138 void count_matches(const std::vector<std::uint8_t>& query,
139 const std::vector<std::uint8_t>& target) {
140 match_count = 0;
141 size_t pos1 = 0, pos2 = 0;
142 for (Item item : cigar)
143 if (item.op() == 'M') {
144 for (uint32_t i = 0; i < item.len(); ++i)
145 if (query[pos1++] == target[pos2++]) {
146 ++match_count;
147 match_string += '|';
148 } else {
149 match_string += '.';
150 }
151 } else if (item.op() == 'I') {
152 pos1 += item.len();
153 match_string.append(item.len(), ' ');
154 } else /*item.op() == 'D'*/ {
155 pos2 += item.len();
156 match_string.append(item.len(), ' ');
157 }
158 }
159
160 std::string add_gaps(const std::string& s, unsigned which) const {
161 std::string out;
162 size_t pos = 0;
163 for (Item item : cigar) {
164 bool show = (item.value & 0xf) == 0 || (item.value & 0xf) == which;
165 for (uint32_t i = 0; i < item.len(); ++i)
166 out += show ? s.at(pos++) : '-';
167 }
168 return out;
169 }
170
171 std::string formatted(const std::string& a, const std::string& b) const {
172 std::string r;
173 r.reserve((match_string.size() + 1) * 3);
174 r += add_gaps(a, 1);
175 r += '\n';
176 r += match_string;
177 r += '\n';
178 r += add_gaps(b, 2);
179 r += '\n';
180 return r;
181 }
182
183 // op: 0=match/mismatch, 1=insertion, 2=deletion
184 void push_cigar(std::uint32_t op, int len) {
185 if (cigar.empty() || op != (cigar.back().value & 0xf))
186 cigar.push_back({len<<4 | op});
187 else
188 cigar.back().value += len<<4;
189 }
190};
191
194inline
195AlignmentResult align_sequences(const std::vector<std::uint8_t>& query,
196 const std::vector<std::uint8_t>& target,
197 const std::vector<int>& target_gapo,
198 std::uint8_t m,
199 const AlignmentScoring& scoring) {
200 // generate the query profile
201 std::int16_t *query_profile = new std::int16_t[query.size() * m];
202 {
203 std::uint32_t mat_size = (std::uint32_t) scoring.matrix_encoding.size();
204 if (mat_size * mat_size != scoring.score_matrix.size())
205 fail("align_sequences: internal error (wrong score_matrix)");
206 std::int32_t i = 0;
207 for (std::uint8_t k = 0; k < m; ++k)
208 for (std::uint8_t q : query)
209 if (k < mat_size && q < mat_size)
210 query_profile[i++] = scoring.score_matrix[k * mat_size + q];
211 else
212 query_profile[i++] = (k == q ? scoring.match : scoring.mismatch);
213 }
214
215 struct eh_t { std::int32_t h, e; };
216 eh_t *eh = new eh_t[query.size() + 1];
217 std::int32_t gape = scoring.gape;
218 std::int32_t gapoe = scoring.gapo + gape;
219
220 // fill the first row
221 {
222 std::int32_t gap0 = !target_gapo.empty() ? target_gapo[0] + gape : gapoe;
223 eh[0].h = 0;
224 eh[0].e = gap0 + gapoe;
225 for (std::int32_t j = 1; j <= (std::int32_t)query.size(); ++j) {
226 eh[j].h = gap0 + gape * (j - 1);
227 eh[j].e = gap0 + gapoe + gape * j;
228 }
229 }
230
231 // backtrack matrix; in each cell: f<<4|e<<2|h
232 std::uint8_t *z = new std::uint8_t[query.size() * target.size()];
233 // DP loop
234 for (std::int32_t i = 0; i < (std::int32_t)target.size(); ++i) {
235 std::uint8_t target_item = target[i];
236 std::int16_t *scores = &query_profile[target_item * query.size()];
237 std::uint8_t *zi = &z[i * query.size()];
238 std::int32_t h1 = gapoe + gape * i;
239 std::int32_t f = gapoe + gapoe + gape * i;
240 std::int32_t gapx = i+1 < (std::int32_t)target_gapo.size()
241 ? target_gapo[i+1] + gape : gapoe;
242 for (std::size_t j = 0; j < query.size(); ++j) {
243 // At the beginning of the loop:
244 // eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
245 // Cells are computed in the following order:
246 // H(i,j) = max{H(i-1,j-1) + S(i,j), E(i,j), F(i,j)}
247 // E(i+1,j) = max{H(i,j)+gapo, E(i,j)} + gape
248 // F(i,j+1) = max{H(i,j)+gapo, F(i,j)} + gape
249 eh_t *p = &eh[j];
250 std::int32_t h = p->h;
251 std::int32_t e = p->e;
252 p->h = h1;
253 h += scores[j];
254 std::uint8_t direction = 0;
255 if (h < e) {
256 direction = 1; // deletion
257 h = e;
258 }
259 if (h <= f) {
260 direction = 2; // insertion
261 h = f;
262 }
263 h1 = h;
264
265 h += gapoe;
266 e += gape;
267 if (e > h)
268 direction |= 0x08;
269 else
270 e = h;
271
272 h = h1 + gapx;
273 p->e = e;
274 f += gape;
275 if (f > h)
276 direction |= 0x10;
277 else
278 f = h;
279
280 // z[i,j] keeps h for the current cell and e/f for the next cell
281 zi[j] = direction;
282 }
283 eh[query.size()].h = h1;
284 eh[query.size()].e = -0x40000000; // -infinity
285 }
286
288 result.score = eh[query.size()].h;
289 delete [] query_profile;
290 delete [] eh;
291 result.backtrack_to_cigar(z, (int)target.size(), (int)query.size());
292 delete [] z;
293 result.count_matches(query, target);
294 return result;
295}
296
297inline
298AlignmentResult align_string_sequences(const std::vector<std::string>& query,
299 const std::vector<std::string>& target,
300 const std::vector<int>& target_gapo,
301 const AlignmentScoring* scoring) {
302 if (scoring == nullptr)
304 std::map<std::string, std::uint8_t> encoding;
305 for (const std::string& res_name : scoring->matrix_encoding)
306 encoding.emplace(res_name, (std::uint8_t)encoding.size());
307 for (const std::string& s : query)
308 encoding.emplace(s, (std::uint8_t)encoding.size());
309 for (const std::string& s : target)
310 encoding.emplace(s, (std::uint8_t)encoding.size());
311 if (encoding.size() > 255)
312 return AlignmentResult();
313 std::vector<std::uint8_t> encoded_query(query.size());
314 for (size_t i = 0; i != query.size(); ++i)
316 std::vector<std::uint8_t> encoded_target(target.size());
317 for (size_t i = 0; i != target.size(); ++i)
320 target_gapo, (std::uint8_t)encoding.size(), *scoring);
321}
322
323} // namespace gemmi
324#endif
fail(), unreachable() and __declspec/__attribute__ macros
AlignmentResult align_sequences(const std::vector< std::uint8_t > &query, const std::vector< std::uint8_t > &target, const std::vector< int > &target_gapo, std::uint8_t m, const AlignmentScoring &scoring)
All values in query and target must be less then m.
Definition seqalign.hpp:195
void fail(const std::string &msg)
Definition fail.hpp:59
AlignmentResult align_string_sequences(const std::vector< std::string > &query, const std::vector< std::string > &target, const std::vector< int > &target_gapo, const AlignmentScoring *scoring)
Definition seqalign.hpp:298
std::uint32_t len() const
Definition seqalign.hpp:75
double calculate_identity(int which=0) const
Definition seqalign.hpp:100
std::string formatted(const std::string &a, const std::string &b) const
Definition seqalign.hpp:171
void count_matches(const std::vector< std::uint8_t > &query, const std::vector< std::uint8_t > &target)
Definition seqalign.hpp:138
std::string cigar_str() const
Definition seqalign.hpp:82
std::string match_string
Definition seqalign.hpp:79
void backtrack_to_cigar(const std::uint8_t *p, int i, int j)
Definition seqalign.hpp:108
void push_cigar(std::uint32_t op, int len)
Definition seqalign.hpp:184
std::size_t input_length(int which) const
Definition seqalign.hpp:92
std::vector< Item > cigar
Definition seqalign.hpp:80
std::string add_gaps(const std::string &s, unsigned which) const
Definition seqalign.hpp:160
static const AlignmentScoring * partial_model()
Definition seqalign.hpp:36
std::vector< std::string > matrix_encoding
Definition seqalign.hpp:29
static const AlignmentScoring * simple()
Definition seqalign.hpp:31
std::vector< std::int8_t > score_matrix
Definition seqalign.hpp:28
static const AlignmentScoring * blosum62()
Definition seqalign.hpp:40