Gemmi C++ API
Loading...
Searching...
No Matches
dirwalk.hpp
Go to the documentation of this file.
1// Copyright 2018 Global Phasing Ltd.
2//
// Classes for iterating over files in a directory tree, top-down,
// in alphabetical order. They wrap the tinydir library (as we cannot
// depend on C++17 <filesystem> yet).
6
7// DirWalk<> iterates through all files and directories.
8// CifWalk yields only cif files (either files that end with .cif or .cif.gz,
9// or files that look like SF mmCIF files from wwPDB, e.g. r3aaasf.ent.gz).
10// It's good for traversing a local copy of the wwPDB archive.
11// PdbWalk: .pdb or .ent (optionally with .gz) except r????sf.ent
12// CoorFileWalk: .cif, .pdb or .ent (optionally with .gz)
13// except r????sf.ent and *-sf.cif
14//
// Usage:
//   for (const std::string& file : gemmi::DirWalk<>(top_dir))
//     do_something(file);
// or
//   for (const std::string& file : gemmi::CifWalk(top_dir))
//     do_something(file);
// You should also catch std::runtime_error.
22
23#ifndef GEMMI_DIRWALK_HPP_
24#define GEMMI_DIRWALK_HPP_
25
26#include <string>
27#include <vector>
28#include <cassert>
29#if defined(_MSC_VER) && !defined(NOMINMAX)
30# define NOMINMAX
31#endif
32#include "third_party/tinydir.h"
33
34#include "util.hpp" // for giends_with
35#include "fail.hpp" // for sys_fail
36#include "pdb_id.hpp" // for is_pdb_code, expand_pdb_code_to_path
37#if defined(_WIN32) && defined(_UNICODE)
38 #include "utf.hpp"
39#endif
40
41namespace gemmi {
42
43inline std::string as_utf8(const _tinydir_char_t* path) {
44#if defined(_WIN32) && defined(_UNICODE)
45 return wchar_to_UTF8(path);
46#else
47 return path;
48#endif
49}
50
// Linear-time glob matching: https://research.swtch.com/glob
// Returns true if the whole of `str` matches `pattern`, where '*' matches
// any (possibly empty) run of characters and '?' matches exactly one
// character. Every other character must match literally (case-sensitive).
inline bool glob_match(const std::string& pattern, const std::string& str) {
  // Restart point used when matching after the most recent '*' fails:
  // go back to that star (pat_next) and let it absorb one more character
  // of str (str_next). npos means that no '*' has been seen yet.
  size_t pat_next = 0;
  size_t str_next = std::string::npos;
  size_t pat_pos = 0;
  size_t str_pos = 0;
  while (pat_pos < pattern.size() || str_pos < str.size()) {
    if (pat_pos < pattern.size()) {
      const char c = pattern[pat_pos];
      if (c == '*') {
        // Try matching zero characters first; remember where to retry from.
        pat_next = pat_pos;
        str_next = str_pos + 1;
        pat_pos++;
        continue;
      }
      if (str_pos < str.size() && (c == '?' || c == str[str_pos])) {
        pat_pos++;
        str_pos++;
        continue;
      }
    }
    // Mismatch. Without a usable restart point the match fails
    // (both npos and str.size()+1 compare greater than str.size()).
    if (str_next > str.size())
      return false;
    // Backtrack: without these two assignments the loop would spin forever.
    pat_pos = pat_next;
    str_pos = str_next;
  }
  return true;
}
78
79
80namespace impl {
81// the SF mmCIF files from PDB have names such as
82// divided/structure_factors/aa/r3aaasf.ent.gz
83inline bool is_rxsf_ent_filename(const std::string& filename) {
84 return filename[0] == 'r' && giends_with(filename, "sf.ent")
85 && filename.find('.') >= 4;
86}
87
88struct IsMmCifFile { // actually we don't know what kind of cif file it is
89 static bool check(const std::string& filename) {
90 return giends_with(filename, ".cif") || giends_with(filename, ".mmcif");
91 }
92};
93
94struct IsCifFile {
95 static bool check(const std::string& filename) {
96 return giends_with(filename, ".cif") || is_rxsf_ent_filename(filename);
97 }
98};
99
100struct IsPdbFile {
101 static bool check(const std::string& filename) {
102 return giends_with(filename, ".pdb") ||
103 (giends_with(filename, ".ent") && !is_rxsf_ent_filename(filename));
104 }
105};
106
107struct IsCoordinateFile {
108 static bool check(const std::string& filename) {
109 // the SF mmCIF files from RCSB website have names such as 3AAA-sf.cif
110 return IsPdbFile::check(filename) ||
111 (IsMmCifFile::check(filename) && !giends_with(filename, "-sf.cif"));
112 }
113};
114
// File-name filter that accepts every name.
struct IsAnyFile {
  static bool check(const std::string& /*filename*/) {
    return true;
  }
};
118
119struct IsMatchingFile {
120 bool check(const std::string& filename) const {
121 return glob_match(pattern, filename);
122 }
123 std::string pattern;
124};
125
126inline int utf8_tinydir_file_open(tinydir_file* file, const char* path) {
127#if defined(_WIN32) && defined(_UNICODE)
128 return tinydir_file_open(file, UTF8_to_wchar(path).c_str());
129#else
130 return tinydir_file_open(file, path);
131#endif
132}
133
134} // namespace impl
135
136
137template<bool FileOnly=true, typename Filter=impl::IsAnyFile>
138class DirWalk {
139public:
140 explicit DirWalk(const char* path, char try_pdbid='\0') {
141 if (impl::utf8_tinydir_file_open(&top_, path) != -1)
142 return;
143 if (try_pdbid != '\0' && is_pdb_code(path)) {
144 std::string epath = expand_pdb_code_to_path(path, try_pdbid, true);
145 if (impl::utf8_tinydir_file_open(&top_, epath.c_str()) != -1)
146 return;
147 sys_fail("Cannot open " + epath);
148 }
149 sys_fail("Cannot open " + std::string(path));
150 }
151 explicit DirWalk(const std::string& path, char try_pdbid='\0')
152 : DirWalk(path.c_str(), try_pdbid) {}
154 for (auto& d : dirs_)
155 tinydir_close(&d.second);
156 }
157 void push_dir(size_t cur_pos, const _tinydir_char_t* path) {
158 dirs_.emplace_back();
159 dirs_.back().first = cur_pos;
160 if (tinydir_open_sorted(&dirs_.back().second, path) == -1)
161 sys_fail("Cannot open directory " + as_utf8(path));
162 }
163 size_t pop_dir() {
164 assert(!dirs_.empty());
165 size_t old_pos = dirs_.back().first;
166 tinydir_close(&dirs_.back().second);
167 dirs_.pop_back();
168 return old_pos;
169 }
170
171 struct Iter {
173 size_t cur;
174
175 const tinydir_dir& get_dir() const { return walk.dirs_.back().second; }
176
177 const tinydir_file& get() const {
178 if (walk.dirs_.empty())
179 return walk.top_;
181 return get_dir()._files[cur];
182 }
183
184 std::string operator*() const { return as_utf8(get().path); }
185
186 // checks for "." and ".."
187 bool is_special(const _tinydir_char_t* name) const {
188 return name[0] == '.' && (name[1] == '\0' ||
189 (name[1] == '.' && name[2] == '\0'));
190 }
191
192 size_t depth() const { return walk.dirs_.size(); }
193
194 void next() { // depth first
195 const tinydir_file& tf = get();
196 if (tf.is_dir) {
197 walk.push_dir(cur, tf.path);
198 cur = 0;
199 } else {
200 cur++;
201 }
202 while (!walk.dirs_.empty()) {
203 if (cur == get_dir().n_files)
204 cur = walk.pop_dir() + 1;
205 else if (is_special(get_dir()._files[cur].name))
206 cur++;
207 else
208 break;
209 }
210 }
211
212 void operator++() {
213 for (;;) {
214 next();
215 const tinydir_file& f = get();
216 if ((!FileOnly && f.is_dir)
217 || (!f.is_dir && walk.filter.check(as_utf8(f.name)))
219 || (depth() == 0 && cur == 1))
220 break;
221 }
222 }
223
224 // == and != is used only to compare with end()
225 bool operator==(const Iter& o) const { return depth()==0 && cur == o.cur; }
226 bool operator!=(const Iter& o) const { return !operator==(o); }
227 };
228
230 Iter it{*this, 0};
231 if (FileOnly && !is_single_file()) // i.e. the top item is a directory
232 ++it;
233 return it;
234 }
235
236 Iter end() { return Iter{*this, 1}; }
237 bool is_single_file() { return !top_.is_dir; }
238
239private:
240 friend struct Iter;
241 tinydir_file top_;
242 std::vector<std::pair<size_t, tinydir_dir>> dirs_;
243protected:
245};
246
251
252struct GlobWalk : public DirWalk<true, impl::IsMatchingFile> {
253 GlobWalk(const std::string& path, const std::string& glob) : DirWalk(path) {
254 filter.pattern = glob;
255 }
256};
257
258} // namespace gemmi
259#endif
void push_dir(size_t cur_pos, const _tinydir_char_t *path)
Definition dirwalk.hpp:157
bool is_single_file()
Definition dirwalk.hpp:237
DirWalk(const char *path, char try_pdbid='\0')
Definition dirwalk.hpp:140
size_t pop_dir()
Definition dirwalk.hpp:163
DirWalk(const std::string &path, char try_pdbid='\0')
Definition dirwalk.hpp:151
bool is_pdb_code(const std::string &str)
Definition pdb_id.hpp:16
std::wstring UTF8_to_wchar(const char *in)
Definition utf.hpp:12
std::string expand_pdb_code_to_path(const std::string &code, char type, bool throw_if_unset=false)
Call it after checking the code with gemmi::is_pdb_code(code).
Definition pdb_id.hpp:26
std::string wchar_to_UTF8(const wchar_t *in)
Definition utf.hpp:42
GEMMI_COLD void sys_fail(const std::string &msg)
Definition fail.hpp:71
bool glob_match(const std::string &pattern, const std::string &str)
Definition dirwalk.hpp:52
bool giends_with(const std::string &str, const std::string &suffix)
Definition util.hpp:105
std::string as_utf8(const _tinydir_char_t *path)
Definition dirwalk.hpp:43
const tinydir_dir & get_dir() const
Definition dirwalk.hpp:175
std::string operator*() const
Definition dirwalk.hpp:184
bool operator==(const Iter &o) const
Definition dirwalk.hpp:225
bool is_special(const _tinydir_char_t *name) const
Definition dirwalk.hpp:187
const tinydir_file & get() const
Definition dirwalk.hpp:177
size_t depth() const
Definition dirwalk.hpp:192
bool operator!=(const Iter &o) const
Definition dirwalk.hpp:226
GlobWalk(const std::string &path, const std::string &glob)
Definition dirwalk.hpp:253