Gemmi C++ API
Loading...
Searching...
No Matches
seqalign.hpp
Go to the documentation of this file.
1// Simple pairwise sequence alignment.
2//
3// Code in this file is based on and derived from files ksw2_gg.c and ksw2.h
4// from https://github.com/lh3/ksw2, which is under the MIT license.
5// The original code, written by Heng Li, has more features and has more
6// efficient variants that use SSE instructions.
7
8#ifndef GEMMI_SEQALIGN_HPP_
9#define GEMMI_SEQALIGN_HPP_
10
11#include <cstdint>
12#include <algorithm> // for reverse
13#include <map>
14#include <string>
15#include <vector>
16
17namespace gemmi {
18
20 int match;
22 int gapo; // gap opening penalty
23 int gape; // gap extension penalty
24 // In a polymer in model, coordinates are used to determine expected gaps.
25 int good_gapo; // gap opening in expected place in a polymer
26 int bad_gapo; // gap opening that was not predicted
27 std::vector<std::int8_t> score_matrix;
28 std::vector<std::string> matrix_encoding;
29
30 static const AlignmentScoring* simple() {
31 static const AlignmentScoring s = { 1, -1, -1, -1, 0, -2, {}, {} };
32 return &s;
33 }
34 // Scoring for alignment of partially-modelled polymer to its full sequence
36 static const AlignmentScoring s = { 100, -10000, -10000, -1, 0, -200, {}, {} };
37 return &s;
38 }
39 static const AlignmentScoring* blosum62() {
40 // BLAST uses BLOSUM-62 with gap cost (10,1)
41 static const AlignmentScoring s = {
42 1, -4, -10, -1, 0, -10,
43 { 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,
44 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,
45 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,
46 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,
47 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,
48 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,
49 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,
50 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,
51 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,
52 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,
53 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,
54 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,
55 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,
56 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,
57 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,
58 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,
59 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,
60 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,
61 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,
62 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4},
63 {"ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
64 "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL"}
65 };
66 return &s;
67 }
68};
69
71 struct Item {
72 std::uint32_t value;
73 char op() const { return "MID"[value & 0xf]; }
74 std::uint32_t len() const { return value >> 4; }
75 };
76 int score = 0;
77 int match_count = 0;
78 std::string match_string;
79 std::vector<Item> cigar;
80
81 std::string cigar_str() const {
82 std::string s;
83 for (Item item : cigar) {
84 s += std::to_string(item.len());
85 s += item.op();
86 }
87 return s;
88 }
89
90 // 1=query, 2=target, other=shorter
91 std::size_t input_length(int which) const {
92 std::size_t counters[3] = {0, 0, 0};
93 for (Item item : cigar)
94 counters[item.value & 0xf] += item.len();
95 if (which == 1 || which == 2)
96 return counters[0] + counters[which];
97 return counters[0] + std::min(counters[1], counters[2]);
98 }
99 double calculate_identity(int which=0) const {
100 return 100. * match_count / input_length(which);
101 }
102
103 // In the backtrack matrix, value p[] has the following structure:
104 // bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F
105 // bit 3/0x08: 1 if a continuation on the E state
106 // bit 4/0x10: 1 if a continuation on the F state
107 void backtrack_to_cigar(const std::uint8_t *p, int i, int j) {
108 i--;
109 int j0 = j--;
110 int state = 0;
111 while (i >= 0 && j >= 0) {
112 // at the beginning of the loop, _state_ tells us which state to check
113 // if requesting the H state, find state one maximizes it.
114 uint32_t tmp = p[(std::size_t)i * j0 + j];
115 if (state == 0 || (tmp & (1 << (state + 2))) == 0)
116 state = tmp & 7;
117 if (state == 0) { // match
118 push_cigar(0, 1);
119 --i;
120 --j;
121 } else if (state == 1) { // deletion
122 push_cigar(2, 1);
123 --i;
124 } else { // insertion
125 push_cigar(1, 1);
126 --j;
127 }
128 }
129 if (i >= 0)
130 push_cigar(2, i + 1); // first deletion
131 else if (j >= 0)
132 push_cigar(1, j + 1); // first insertion
133 std::reverse(cigar.begin(), cigar.end());
134 }
135
136
137 void count_matches(const std::vector<std::uint8_t>& query,
138 const std::vector<std::uint8_t>& target) {
139 match_count = 0;
140 size_t pos1 = 0, pos2 = 0;
141 for (Item item : cigar)
142 if (item.op() == 'M') {
143 for (uint32_t i = 0; i < item.len(); ++i)
144 if (query[pos1++] == target[pos2++]) {
145 ++match_count;
146 match_string += '|';
147 } else {
148 match_string += '.';
149 }
150 } else if (item.op() == 'I') {
151 pos1 += item.len();
152 match_string.append(item.len(), ' ');
153 } else /*item.op() == 'D'*/ {
154 pos2 += item.len();
155 match_string.append(item.len(), ' ');
156 }
157 }
158
159 std::string add_gaps(const std::string& s, unsigned which) const {
160 std::string out;
161 size_t pos = 0;
162 for (Item item : cigar) {
163 bool show = (item.value & 0xf) == 0 || (item.value & 0xf) == which;
164 for (uint32_t i = 0; i < item.len(); ++i)
165 out += show ? s.at(pos++) : '-';
166 }
167 return out;
168 }
169
170 std::string formatted(const std::string& a, const std::string& b) const {
171 std::string r;
172 r.reserve((match_string.size() + 1) * 3);
173 r += add_gaps(a, 1);
174 r += '\n';
175 r += match_string;
176 r += '\n';
177 r += add_gaps(b, 2);
178 r += '\n';
179 return r;
180 }
181
182 // op: 0=match/mismatch, 1=insertion, 2=deletion
183 void push_cigar(std::uint32_t op, int len) {
184 if (cigar.empty() || op != (cigar.back().value & 0xf))
185 cigar.push_back({len<<4 | op});
186 else
187 cigar.back().value += len<<4;
188 }
189};
190
193inline
194AlignmentResult align_sequences(const std::vector<std::uint8_t>& query,
195 const std::vector<std::uint8_t>& target,
196 const std::vector<int>& target_gapo,
197 std::uint8_t m,
198 const AlignmentScoring& scoring) {
199 // generate the query profile
200 std::int16_t *query_profile = new std::int16_t[query.size() * m];
201 {
202 std::uint32_t mat_size = (std::uint32_t) scoring.matrix_encoding.size();
203 std::int32_t i = 0;
204 for (std::uint8_t k = 0; k < m; ++k)
205 for (std::uint8_t q : query)
206 if (k < mat_size && q < mat_size)
207 query_profile[i++] = scoring.score_matrix[k * m + q];
208 else
209 query_profile[i++] = (k == q ? scoring.match : scoring.mismatch);
210 }
211
212 struct eh_t { std::int32_t h, e; };
213 eh_t *eh = new eh_t[query.size() + 1];
214 std::int32_t gape = scoring.gape;
215 std::int32_t gapoe = scoring.gapo + gape;
216
217 // fill the first row
218 {
219 std::int32_t gap0 = !target_gapo.empty() ? target_gapo[0] + gape : gapoe;
220 eh[0].h = 0;
221 eh[0].e = gap0 + gapoe;
222 for (std::int32_t j = 1; j <= (std::int32_t)query.size(); ++j) {
223 eh[j].h = gap0 + gape * (j - 1);
224 eh[j].e = gap0 + gapoe + gape * j;
225 }
226 }
227
228 // backtrack matrix; in each cell: f<<4|e<<2|h
229 std::uint8_t *z = new std::uint8_t[query.size() * target.size()];
230 // DP loop
231 for (std::int32_t i = 0; i < (std::int32_t)target.size(); ++i) {
232 std::uint8_t target_item = target[i];
233 std::int16_t *scores = &query_profile[target_item * query.size()];
234 std::uint8_t *zi = &z[i * query.size()];
235 std::int32_t h1 = gapoe + gape * i;
236 std::int32_t f = gapoe + gapoe + gape * i;
237 std::int32_t gapx = i+1 < (std::int32_t)target_gapo.size()
238 ? target_gapo[i+1] + gape : gapoe;
239 for (std::size_t j = 0; j < query.size(); ++j) {
240 // At the beginning of the loop:
241 // eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
242 // Cells are computed in the following order:
243 // H(i,j) = max{H(i-1,j-1) + S(i,j), E(i,j), F(i,j)}
244 // E(i+1,j) = max{H(i,j)+gapo, E(i,j)} + gape
245 // F(i,j+1) = max{H(i,j)+gapo, F(i,j)} + gape
246 eh_t *p = &eh[j];
247 std::int32_t h = p->h;
248 std::int32_t e = p->e;
249 p->h = h1;
250 h += scores[j];
251 std::uint8_t direction = 0;
252 if (h < e) {
253 direction = 1; // deletion
254 h = e;
255 }
256 if (h <= f) {
257 direction = 2; // insertion
258 h = f;
259 }
260 h1 = h;
261
262 h += gapoe;
263 e += gape;
264 if (e > h)
265 direction |= 0x08;
266 else
267 e = h;
268
269 h = h1 + gapx;
270 p->e = e;
271 f += gape;
272 if (f > h)
273 direction |= 0x10;
274 else
275 f = h;
276
277 // z[i,j] keeps h for the current cell and e/f for the next cell
278 zi[j] = direction;
279 }
280 eh[query.size()].h = h1;
281 eh[query.size()].e = -0x40000000; // -infinity
282 }
283
285 result.score = eh[query.size()].h;
286 delete [] query_profile;
287 delete [] eh;
288 result.backtrack_to_cigar(z, (int)target.size(), (int)query.size());
289 delete [] z;
290 result.count_matches(query, target);
291 return result;
292}
293
294inline
295AlignmentResult align_string_sequences(const std::vector<std::string>& query,
296 const std::vector<std::string>& target,
297 const std::vector<int>& target_gapo,
298 const AlignmentScoring* scoring) {
299 if (scoring == nullptr)
301 std::map<std::string, std::uint8_t> encoding;
302 for (const std::string& res_name : scoring->matrix_encoding)
303 encoding.emplace(res_name, (std::uint8_t)encoding.size());
304 for (const std::string& s : query)
305 encoding.emplace(s, (std::uint8_t)encoding.size());
306 for (const std::string& s : target)
307 encoding.emplace(s, (std::uint8_t)encoding.size());
308 if (encoding.size() > 255)
309 return AlignmentResult();
310 std::vector<std::uint8_t> encoded_query(query.size());
311 for (size_t i = 0; i != query.size(); ++i)
313 std::vector<std::uint8_t> encoded_target(target.size());
314 for (size_t i = 0; i != target.size(); ++i)
317 target_gapo, (std::uint8_t)encoding.size(), *scoring);
318}
319
320} // namespace gemmi
321#endif
AlignmentResult align_sequences(const std::vector< std::uint8_t > &query, const std::vector< std::uint8_t > &target, const std::vector< int > &target_gapo, std::uint8_t m, const AlignmentScoring &scoring)
All values in query and target must be less then m.
Definition seqalign.hpp:194
AlignmentResult align_string_sequences(const std::vector< std::string > &query, const std::vector< std::string > &target, const std::vector< int > &target_gapo, const AlignmentScoring *scoring)
Definition seqalign.hpp:295
std::uint32_t len() const
Definition seqalign.hpp:74
double calculate_identity(int which=0) const
Definition seqalign.hpp:99
std::string formatted(const std::string &a, const std::string &b) const
Definition seqalign.hpp:170
void count_matches(const std::vector< std::uint8_t > &query, const std::vector< std::uint8_t > &target)
Definition seqalign.hpp:137
std::string cigar_str() const
Definition seqalign.hpp:81
std::string match_string
Definition seqalign.hpp:78
void backtrack_to_cigar(const std::uint8_t *p, int i, int j)
Definition seqalign.hpp:107
void push_cigar(std::uint32_t op, int len)
Definition seqalign.hpp:183
std::size_t input_length(int which) const
Definition seqalign.hpp:91
std::vector< Item > cigar
Definition seqalign.hpp:79
std::string add_gaps(const std::string &s, unsigned which) const
Definition seqalign.hpp:159
static const AlignmentScoring * partial_model()
Definition seqalign.hpp:35
std::vector< std::string > matrix_encoding
Definition seqalign.hpp:28
static const AlignmentScoring * simple()
Definition seqalign.hpp:30
std::vector< std::int8_t > score_matrix
Definition seqalign.hpp:27
static const AlignmentScoring * blosum62()
Definition seqalign.hpp:39