Gemmi C++ API
Loading...
Searching...
No Matches
pirfasta.hpp
Go to the documentation of this file.
1// Copyright 2020 Global Phasing Ltd.
2//
3// Read sequence from PIR or (multi-)FASTA format.
4
5#ifndef GEMMI_PIRFASTA_HPP_
6#define GEMMI_PIRFASTA_HPP_
7
8#include <cctype> // for isspace
9#include <vector>
10#include <algorithm> // for min
11#include "fail.hpp"
12
13namespace gemmi {
14
// One record of a PIR or FASTA file, as produced by read_pir_or_fasta().
struct FastaSeq {
  // Text that follows the leading '>' of the record. For input recognized
  // as PIR it also spans the line after the '>' line.
  std::string header;
  // Accepted sequence characters, concatenated across lines. Whitespace and
  // the '*' terminator are not stored.
  std::string seq;
};
19
// Tells whether `s` begins like a PIR record.
// A PIR record starts with '>', a two-character type code, and ';'.
// Recognized type codes: P1, F1, DL, DC, RL, RC, XX.
// At least one more character must follow the ';'.
inline bool is_pir_format(const std::string& s) {
  // Cheap structural checks first: minimum length and the fixed '>' / ';'.
  if (s.size() < 5 || s[0] != '>' || s[3] != ';')
    return false;
  const char second = s[2];
  switch (s[1]) {
    case 'P':
    case 'F':
      return second == '1';
    case 'D':
    case 'R':
      return second == 'L' || second == 'C';
    case 'X':
      return second == 'X';
    default:
      return false;
  }
}
27
28inline std::vector<FastaSeq> read_pir_or_fasta(const std::string& str) {
29 if (str[0] != '>')
30 fail("PIR/FASTA files start with '>'");
31 bool pir = is_pir_format(str);
32 std::vector<FastaSeq> r;
33 int blank_lines = 0;
34 int paren_level = 0;
35 bool ended = false;
36 for (size_t pos=0, end=0; end != std::string::npos; pos = end + 1) {
37 end = str.find('\n', pos);
38 if (str[pos] == '>') {
39 ended = false;
40 if (paren_level != 0)
41 break;
42 r.emplace_back();
43 if (pir && end != std::string::npos)
44 end = str.find('\n', end+1);
45 r.back().header = str.substr(pos+1, end-(pos+1));
46 } else {
47 std::string& seq = r.back().seq;
48 for (size_t i = pos; i < std::min(end, str.size()); ++i) {
49 char c = str[i];
50 if (std::isspace(c)) {
51 if (c == '\n')
53 continue;
54 }
55 // handle non-blank characters
56 if (ended)
57 fail("'*' is interpreted as sequence terminator, here it is followed by: ", c);
58 if (blank_lines >= 2)
59 fail("blank lines can be followed only by a line starting with '>'");
60 blank_lines = 0;
61 if (('a' <= (c | 0x20) && (c | 0x20) <= 'z') || c == '-' ||
62 (paren_level != 0 && '0' <= c && c <= '9')) {
63 // good character, nothing to be done here
64 } else if (c == '*') {
65 ended = true;
66 continue;
67 } else if (c == '(') {
68 if (++paren_level > 1)
69 fail("nested parentheses are not allowed");
70 } else if (c == ')') {
71 if (--paren_level < 0)
72 fail("')' without matching '('");
73 } else {
74 fail("unexpected character in sequence: ", c);
75 }
76 seq += c;
77 }
78 }
79 }
80 if (paren_level != 0)
81 fail("unmatched '('");
82 return r;
83}
84
85} // namespace gemmi
86#endif
bool is_pir_format(const std::string &s)
Definition pirfasta.hpp:21
void fail(const std::string &msg)
Definition fail.hpp:59
std::vector< FastaSeq > read_pir_or_fasta(const std::string &str)
Definition pirfasta.hpp:28
std::string seq
Definition pirfasta.hpp:17
std::string header
Definition pirfasta.hpp:16