Gemmi C++ API
Loading...
Searching...
No Matches
dirwalk.hpp
Go to the documentation of this file.
1// Copyright 2018 Global Phasing Ltd.
2//
3// Classes for iterating over files in a directory tree, top-down,
4// in alphabetical order. Wraps the tinydir library (as we cannot yet
5// depend on C++17 <filesystem>).
6
7// DirWalk<> iterates through all files and directories.
8// CifWalk yields only cif files (either files that end with .cif or .cif.gz,
9// or files that look like SF mmCIF files from wwPDB, e.g. r3aaasf.ent.gz).
10// It's good for traversing a local copy of the wwPDB archive.
11// PdbWalk: .pdb or .ent (optionally with .gz) except r????sf.ent
12// CoorFileWalk: .cif, .pdb or .ent (optionally with .gz)
13// except r????sf.ent and *-sf.cif
14//
15// Usage:
16// for (const std::string& file : gemmi::DirWalk<>(top_dir))
17// do_something(file);
18// or
19// for (const std::string& file : gemmi::CifWalk(top_dir))
20// do_something(file);
21// You should also catch std::runtime_error.
22
23#ifndef GEMMI_DIRWALK_HPP_
24#define GEMMI_DIRWALK_HPP_
25
26#include <string>
27#include <vector>
28#include <cassert>
29#if defined(_MSC_VER) && !defined(NOMINMAX)
30# define NOMINMAX
31#endif
32#include "third_party/tinydir.h"
33
34#include "util.hpp" // for giends_with
35#include "fail.hpp" // for sys_fail
36#include "pdb_id.hpp" // for is_pdb_code, expand_pdb_code_to_path
37#if defined(_WIN32) && defined(_UNICODE)
38 #include "utf.hpp"
39#endif
40
41namespace gemmi {
42
43inline std::string as_utf8(const _tinydir_char_t* path) {
44#if defined(_WIN32) && defined(_UNICODE)
45 return wchar_to_UTF8(path);
46#else
47 return path;
48#endif
49}
50
// Linear-time glob matching: https://research.swtch.com/glob
//
// Returns true if the whole of `str` matches `pattern`, where
//   '*' matches any (possibly empty) run of characters,
//   '?' matches exactly one character,
//   any other character matches itself (case-sensitive).
//
// Only the most recent '*' is remembered as a restart point, so no
// recursion or exponential backtracking is needed.
inline bool glob_match(const std::string& pattern, const std::string& str) {
  // Restart point: position of the last '*' seen in the pattern...
  std::size_t pat_next = 0;
  // ...and the string position to retry from. npos means "no '*' seen yet",
  // which makes the `str_next > str.size()` test below fail the match.
  std::size_t str_next = std::string::npos;
  std::size_t pat_pos = 0;
  std::size_t str_pos = 0;
  while (pat_pos < pattern.size() || str_pos < str.size()) {
    if (pat_pos < pattern.size()) {
      const char c = pattern[pat_pos];
      if (c == '*') {
        // First try matching zero characters; on a later mismatch we come
        // back here and let the star swallow one more character.
        pat_next = pat_pos;
        str_next = str_pos + 1;
        pat_pos++;
        continue;
      }
      if (str_pos < str.size() && (c == '?' || c == str[str_pos])) {
        pat_pos++;
        str_pos++;
        continue;
      }
    }
    // Mismatch: give up if there is no star to fall back to, or if the star
    // would have to consume characters past the end of the string.
    if (str_next > str.size())
      return false;
    // Otherwise restart from the last star with one more character consumed.
    pat_pos = pat_next;
    str_pos = str_next;
  }
  return true;
}
79
80
81namespace impl {
82// the SF mmCIF files from PDB have names such as
83// divided/structure_factors/aa/r3aaasf.ent.gz
84inline bool is_rxsf_ent_filename(const std::string& filename) {
85 return filename[0] == 'r' && giends_with(filename, "sf.ent")
86 && filename.find('.') >= 4;
87}
88
89struct IsMmCifFile { // actually we don't know what kind of cif file it is
90 static bool check(const std::string& filename) {
91 return giends_with(filename, ".cif") || giends_with(filename, ".mmcif");
92 }
93};
94
95struct IsCifFile {
96 static bool check(const std::string& filename) {
97 return giends_with(filename, ".cif") || is_rxsf_ent_filename(filename);
98 }
99};
100
101struct IsPdbFile {
102 static bool check(const std::string& filename) {
103 return giends_with(filename, ".pdb") ||
104 (giends_with(filename, ".ent") && !is_rxsf_ent_filename(filename));
105 }
106};
107
108struct IsCoordinateFile {
109 static bool check(const std::string& filename) {
110 // the SF mmCIF files from RCSB website have names such as 3AAA-sf.cif
111 return IsPdbFile::check(filename) ||
112 (IsMmCifFile::check(filename) && !giends_with(filename, "-sf.cif"));
113 }
114};
115
// Filter that accepts every name; the argument is not inspected.
struct IsAnyFile {
  static bool check(const std::string& /*filename*/) {
    return true;
  }
};
119
120struct IsMatchingFile {
121 bool check(const std::string& filename) const {
122 return glob_match(pattern, filename);
123 }
124 std::string pattern;
125};
126
127inline int utf8_tinydir_file_open(tinydir_file* file, const char* path) {
128#if defined(_WIN32) && defined(_UNICODE)
129 return tinydir_file_open(file, UTF8_to_wchar(path).c_str());
130#else
131 return tinydir_file_open(file, path);
132#endif
133}
134
135} // namespace impl
136
137
138template<bool FileOnly=true, typename Filter=impl::IsAnyFile>
139class DirWalk {
140public:
141 explicit DirWalk(const char* path, char try_pdbid='\0') {
142 if (impl::utf8_tinydir_file_open(&top_, path) != -1)
143 return;
144 if (try_pdbid != '\0' && is_pdb_code(path)) {
145 std::string epath = expand_pdb_code_to_path(path, try_pdbid, true);
146 if (impl::utf8_tinydir_file_open(&top_, epath.c_str()) != -1)
147 return;
148 sys_fail("Cannot open " + epath);
149 }
150 sys_fail("Cannot open " + std::string(path));
151 }
152 explicit DirWalk(const std::string& path, char try_pdbid='\0')
153 : DirWalk(path.c_str(), try_pdbid) {}
155 for (auto& d : dirs_)
156 tinydir_close(&d.second);
157 }
158 void push_dir(size_t cur_pos, const _tinydir_char_t* path) {
159 dirs_.emplace_back();
160 dirs_.back().first = cur_pos;
161 if (tinydir_open_sorted(&dirs_.back().second, path) == -1)
162 sys_fail("Cannot open directory " + as_utf8(path));
163 }
164 size_t pop_dir() {
165 assert(!dirs_.empty());
166 size_t old_pos = dirs_.back().first;
167 tinydir_close(&dirs_.back().second);
168 dirs_.pop_back();
169 return old_pos;
170 }
171
172 struct Iter {
174 size_t cur;
175
176 const tinydir_dir& get_dir() const { return walk.dirs_.back().second; }
177
178 const tinydir_file& get() const {
179 if (walk.dirs_.empty())
180 return walk.top_;
182 return get_dir()._files[cur];
183 }
184
185 std::string operator*() const { return as_utf8(get().path); }
186
187 // checks for "." and ".."
188 bool is_special(const _tinydir_char_t* name) const {
189 return name[0] == '.' && (name[1] == '\0' ||
190 (name[1] == '.' && name[2] == '\0'));
191 }
192
193 size_t depth() const { return walk.dirs_.size(); }
194
195 void next() { // depth first
196 const tinydir_file& tf = get();
197 if (tf.is_dir) {
198 walk.push_dir(cur, tf.path);
199 cur = 0;
200 } else {
201 cur++;
202 }
203 while (!walk.dirs_.empty()) {
204 if (cur == get_dir().n_files)
205 cur = walk.pop_dir() + 1;
206 else if (is_special(get_dir()._files[cur].name))
207 cur++;
208 else
209 break;
210 }
211 }
212
213 void operator++() {
214 for (;;) {
215 next();
216 const tinydir_file& f = get();
217 if ((!FileOnly && f.is_dir)
218 || (!f.is_dir && walk.filter.check(as_utf8(f.name)))
220 || (depth() == 0 && cur == 1))
221 break;
222 }
223 }
224
225 // == and != is used only to compare with end()
226 bool operator==(const Iter& o) const { return depth()==0 && cur == o.cur; }
227 bool operator!=(const Iter& o) const { return !operator==(o); }
228 };
229
231 Iter it{*this, 0};
232 if (FileOnly && !is_single_file()) // i.e. the top item is a directory
233 ++it;
234 return it;
235 }
236
237 Iter end() { return Iter{*this, 1}; }
238 bool is_single_file() { return !top_.is_dir; }
239
240private:
241 friend struct Iter;
242 tinydir_file top_;
243 std::vector<std::pair<size_t, tinydir_dir>> dirs_;
244protected:
246};
247
252
253struct GlobWalk : public DirWalk<true, impl::IsMatchingFile> {
254 GlobWalk(const std::string& path, const std::string& glob) : DirWalk(path) {
255 filter.pattern = glob;
256 }
257};
258
259} // namespace gemmi
260#endif
void push_dir(size_t cur_pos, const _tinydir_char_t *path)
Definition dirwalk.hpp:158
bool is_single_file()
Definition dirwalk.hpp:238
DirWalk(const char *path, char try_pdbid='\0')
Definition dirwalk.hpp:141
size_t pop_dir()
Definition dirwalk.hpp:164
DirWalk(const std::string &path, char try_pdbid='\0')
Definition dirwalk.hpp:152
fail(), unreachable() and __declspec/__attribute__ macros
bool is_pdb_code(const std::string &str)
Definition pdb_id.hpp:23
std::wstring UTF8_to_wchar(const char *in)
Definition utf.hpp:12
std::string expand_pdb_code_to_path(const std::string &code, char type, bool throw_if_unset=false)
Call it after checking the code with gemmi::is_pdb_code(code).
Definition pdb_id.hpp:52
std::string wchar_to_UTF8(const wchar_t *in)
Definition utf.hpp:42
GEMMI_COLD void sys_fail(const std::string &msg)
Definition fail.hpp:71
bool glob_match(const std::string &pattern, const std::string &str)
Definition dirwalk.hpp:52
bool giends_with(const std::string &str, const std::string &suffix)
Definition util.hpp:106
std::string as_utf8(const _tinydir_char_t *path)
Definition dirwalk.hpp:43
Handling PDB ID and $PDB_DIR: is_pdb_code(), expand_pdb_code_to_path(), . . .
const tinydir_dir & get_dir() const
Definition dirwalk.hpp:176
std::string operator*() const
Definition dirwalk.hpp:185
bool operator==(const Iter &o) const
Definition dirwalk.hpp:226
bool is_special(const _tinydir_char_t *name) const
Definition dirwalk.hpp:188
const tinydir_file & get() const
Definition dirwalk.hpp:178
size_t depth() const
Definition dirwalk.hpp:193
bool operator!=(const Iter &o) const
Definition dirwalk.hpp:227
GlobWalk(const std::string &path, const std::string &glob)
Definition dirwalk.hpp:254
Conversion between UTF-8 and wchar. Used only for file names on Windows.
Utilities. Mostly for working with strings and vectors.