Gemmi C++ API
Loading...
Searching...
No Matches
gz.hpp
Go to the documentation of this file.
1// Copyright 2017 Global Phasing Ltd.
2//
3// Functions for transparent reading of gzipped files. Uses zlib.
4
5#ifndef GEMMI_GZ_HPP_
6#define GEMMI_GZ_HPP_
7#include <cassert>
8#include <cstdio> // fseek, ftell, fread
9#include <climits> // INT_MAX
10#include <string>
11#include <zlib.h>
12#include "fail.hpp" // fail, sys_fail
13#include "fileutil.hpp" // file_open
14#include "input.hpp" // BasicInput
15#include "util.hpp" // iends_with
16
17namespace gemmi {
18
19// Throws if the size is not found or if it is suspicious.
20// Anything outside of the arbitrary limits from 1 to 10x of the compressed
21// size looks suspicious to us.
22// **This function should not be relied upon.**
23// In particular, if the return values is >= 4GiB - it's only a guess.
24inline size_t estimate_uncompressed_size(const std::string& path) {
25 fileptr_t f = file_open(path.c_str(), "rb");
26 if (std::fseek(f.get(), -4, SEEK_END) != 0)
27 sys_fail("fseek() failed (empty file?): " + path);
28 long pos = std::ftell(f.get());
29 if (pos <= 0)
30 sys_fail("ftell() failed on " + path);
31 size_t gzipped_size = pos + 4;
32 unsigned char buf[4];
33 if (std::fread(buf, 1, 4, f.get()) != 4)
34 sys_fail("Failed to read last 4 bytes of: " + path);
35 unsigned orig_size = (buf[3] << 24) | (buf[2] << 16) | (buf[1] << 8) | buf[0];
36 if (orig_size + 100 < gzipped_size || orig_size > 100 * gzipped_size) {
37 // The size is stored as 32-bit number. If the original size exceeds 4GiB,
38 // the stored number is modulo 4 GiB. So we just guess...
39 constexpr size_t max_uint = 4294967295U;
40 if (gzipped_size > max_uint / 6)
41 return max_uint + (sizeof(size_t) > 4 ? orig_size : 0);
42 fail("Cannot determine uncompressed size of " + path +
43 "\nWould it be " + std::to_string(gzipped_size) + " -> " +
44 std::to_string(orig_size) + " bytes?");
45 }
46 return orig_size;
47}
48
49inline size_t big_gzread(gzFile file, void* buf, size_t len) {
50 // In zlib >= 1.2.9 we could use gzfread()
51 // return gzfread(buf, len, 1, f) == 1;
52 size_t read_bytes = 0;
53 while (len > INT_MAX) {
54 int ret = gzread(file, buf, INT_MAX);
55 read_bytes += ret;
56 if (ret != INT_MAX)
57 return read_bytes;
58 len -= INT_MAX;
59 buf = (char*) buf + INT_MAX;
60 }
61 read_bytes += gzread(file, buf, (unsigned) len);
62 return read_bytes;
63}
64
// Input source for a file that may be gzip-compressed. Derives from
// BasicInput (which holds the path) and reads through zlib's gzFile API.
// Whether the file is treated as compressed is decided only from the path
// suffix ".gz" (see is_compressed()).
// NOTE(review): several lines of this listing are absent from this view
// (listing lines 68, 76, 105, 107, 136, 137); the notes below mark where.
65class MaybeGzipped : public BasicInput {
66public:
// Small wrapper exposing line, character and block reads on a zlib handle f.
// NOTE(review): the declaration of member `f` (listing line 68) is not
// visible here; get_uncompressing_stream() constructs GzStream{file_}, so it
// is presumably a gzFile - confirm against the header.
67 struct GzStream {
// Forwards to gzgets().
69 char* gets(char* line, int size) { return gzgets(f, line, size); }
// Forwards to gzgetc().
70 int getc() { return gzgetc(f); }
// Returns true only if exactly len bytes were read.
71 bool read(void* buf, size_t len) { return big_gzread(f, buf, len) == len; }
72 };
73
// Stores the path in BasicInput; the file is not opened here (file_ starts
// as nullptr and is set by open()).
74 explicit MaybeGzipped(const std::string& path)
75 : BasicInput(path), file_(nullptr) {}
// NOTE(review): listing line 76 is missing. The body below closes file_ when
// it is non-null, so this is presumably the destructor - confirm.
// gzclose_r() is used when ZLIB_VERNUM >= 0x1235, otherwise gzclose().
77 if (file_)
78#if ZLIB_VERNUM >= 0x1235
79 gzclose_r(file_);
80#else
81 gzclose(file_);
82#endif
83 }
84
// Reads up to len bytes from file_ into buf and returns the number read.
// When fewer than len bytes were read and the stream is not at end of file,
// gzerror() is consulted: Z_ERRNO leads to sys_fail(), any other non-zero
// error code leads to fail() with zlib's message. A short read with no
// error reported is returned to the caller as is.
85 size_t gzread_checked(void* buf, size_t len) {
86 size_t read_bytes = big_gzread(file_, buf, len);
87 if (read_bytes != len && !gzeof(file_)) {
88 int errnum = 0;
89 std::string err_str = gzerror(file_, &errnum);
90 if (errnum == Z_ERRNO)
91 sys_fail("failed to read " + path());
92 if (errnum)
93 fail("Error reading " + path() + ": " + err_str);
94 }
// Sanity guard against a byte count larger than what was requested.
95 if (read_bytes > len) // should never happen
96 fail("Error reading " + path());
97 return read_bytes;
98 }
99
// True if the path ends with ".gz" according to iends_with()
// (presumably a case-insensitive comparison - confirm in util.hpp).
100 bool is_compressed() const { return iends_with(path(), ".gz"); }
// The path with the trailing 3 characters (".gz") removed when
// is_compressed() is true; otherwise the path unchanged.
101 std::string basepath() const {
102 return is_compressed() ? path().substr(0, path().size() - 3) : path();
103 }
104
// Body of uncompress_into_buffer(size_t limit=0), which returns a CharArray.
// NOTE(review): the signature line (listing line 105) and the statement
// executed when the file is not compressed (listing line 107) are missing
// from this view - confirm both against the header.
// For compressed files: the buffer size is `limit`, or the value from
// estimate_uncompressed_size() when limit is 0. Sizes above 3221225471
// bytes are rejected with fail().
106 if (!is_compressed())
108 size_t size = (limit == 0 ? estimate_uncompressed_size(path()) : limit);
109 open();
110 if (size > 3221225471)
111 // if this exception is changed adjust prog/cif2mtz.cpp
112 fail("For now gz files above 3 GiB uncompressed are not supported.\n"
113 "To read " + path() + " first uncompress it.");
114 CharArray mem(size);
115 size_t read_bytes = gzread_checked(mem.data(), size);
116 // if the file is shorter than the size from header, adjust size
117 if (read_bytes < size) {
118 mem.set_size(read_bytes); // should we call resize() here
119 } else if (limit == 0) { // read_bytes == size
120 // if the file is longer than the size from header, read in the rest
// Peek one character to see whether more data follows; if so, push it
// back, double the buffer and read into the new half.
// NOTE(review): if mem.size() can be 0 here, 2 * old_size stays 0 and
// nothing is consumed per iteration - this loop may then never end; verify.
121 int next_char;
122 while (!gzeof(file_) && (next_char = gzgetc(file_)) != -1) {
123 if (mem.size() > 3221225471)
124 fail("For now gz files above 3 GiB uncompressed are not supported.\n"
125 "To read " + path() + " first uncompress it.");
126 gzungetc(next_char, file_);
127 size_t old_size = mem.size();
128 mem.resize(2 * old_size);
129 size_t n = gzread_checked(mem.data() + old_size, old_size);
130 mem.set_size(old_size + n);
131 }
132 }
133 return mem;
134 }
135
// Body of get_uncompressing_stream(), which returns a GzStream.
// NOTE(review): the signature line (listing line 136) and listing line 137
// are missing from this view - confirm against the header.
// Opens the file, requests a 64 KiB zlib buffer via gzbuffer() when
// ZLIB_VERNUM >= 0x1235, and returns a GzStream holding the handle.
138 open();
139#if ZLIB_VERNUM >= 0x1235
140 gzbuffer(file_, 64*1024);
141#endif
142 return GzStream{file_};
143 }
144
145private:
// zlib handle; nullptr until open() succeeds.
146 gzFile file_;
147
// Opens path() for reading with gzopen(); calls sys_fail() if that fails.
148 void open() {
149 file_ = gzopen(path().c_str(), "rb");
150 if (!file_)
151 sys_fail("Failed to gzopen " + path());
152 }
153};
154
155} // namespace gemmi
156
157#endif
CharArray uncompress_into_buffer(size_t=0)
Definition input.hpp:148
const std::string & path() const
Definition input.hpp:136
size_t gzread_checked(void *buf, size_t len)
Definition gz.hpp:85
MaybeGzipped(const std::string &path)
Definition gz.hpp:74
GzStream get_uncompressing_stream()
Definition gz.hpp:136
bool is_compressed() const
Definition gz.hpp:100
CharArray uncompress_into_buffer(size_t limit=0)
Definition gz.hpp:105
std::string basepath() const
Definition gz.hpp:101
std::unique_ptr< std::FILE, decltype(&std::fclose)> fileptr_t
Definition fileutil.hpp:37
size_t estimate_uncompressed_size(const std::string &path)
Definition gz.hpp:24
bool iends_with(const std::string &str, const std::string &suffix)
Definition util.hpp:98
fileptr_t file_open(const char *path, const char *mode)
Definition fileutil.hpp:39
GEMMI_COLD void sys_fail(const std::string &msg)
Definition fail.hpp:71
void fail(const std::string &msg)
Definition fail.hpp:59
size_t big_gzread(gzFile file, void *buf, size_t len)
Definition gz.hpp:49
char * gets(char *line, int size)
Definition gz.hpp:69
bool read(void *buf, size_t len)
Definition gz.hpp:71