/// @file htslib/bgzf.h
/// Low-level routines for direct BGZF operations.
/*
   Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
                 2011, 2012 Attractive Chaos <attractor@live.co.uk>
   Copyright (C) 2009, 2013, 2014, 2017, 2018-2019 Genome Research Ltd

   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
   in the Software without restriction, including without limitation the rights
   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   copies of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be included in
   all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
   THE SOFTWARE.
*/

/* The BGZF library was originally written by Bob Handsaker from the Broad
 * Institute. It was later improved by the SAMtools developers.
 */

// This module holds the D declarations for the BGZF C API: constants, the
// BGZF handle structure, one D template (bgzf_tell) and extern (C) prototypes.
module htslib.bgzf;

import core.stdc.stdio;
import core.sys.posix.sys.types;

import htslib.hfile : hFILE;
import htslib.kstring;

// The three attribute labels below apply to every declaration that follows.
@system:
nothrow:
@nogc:

// ssize_t doesn't exist in core.sys.posix.sys.types for windows builds
// NOTE(review): off_t (used by bgzf_useek/bgzf_utell below) gets no Windows
// alias here -- confirm it resolves on Windows builds.
version(Windows){
    version(Win32){
        alias ssize_t = int;
    }
    version(Win64){
        alias ssize_t = long;
    }
}

// Everything from here on has C linkage unless explicitly marked otherwise.
extern (C):

enum BGZF_BLOCK_SIZE = 0xff00; // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE
enum BGZF_MAX_BLOCK_SIZE = 0x10000;

// Error codes. The values are distinct powers of two; presumably they are
// OR-ed together into BGZF.errcode -- confirm against the C implementation.
enum BGZF_ERR_ZLIB = 1;
enum BGZF_ERR_HEADER = 2;
enum BGZF_ERR_IO = 4;
enum BGZF_ERR_MISUSE = 8;
enum BGZF_ERR_MT = 16; // stream cannot be multi-threaded
enum BGZF_ERR_CRC = 32;

// Opaque types: declared without a body and only used through pointers in
// this module.
struct hts_tpool;
struct bgzf_mtaux_t;
struct __bgzidx_t;
alias bgzidx_t = __bgzidx_t;
struct bgzf_cache_t;
struct z_stream_s;

/**
 * State of an open BGZF stream, as handed out by bgzf_open(), bgzf_dopen()
 * and bgzf_hopen().
 *
 * NOTE(review): the field order, field types and bitfield widths presumably
 * mirror `struct BGZF` in the C header htslib/bgzf.h; do not reorder or
 * resize anything without checking the C definition.
 */
struct BGZF
{
    import std.bitmanip : bitfields;

    // Packed flags; the widths add up to 32 bits (16+1+1+1+1+9+1+1+1).
    mixin(bitfields!(
        uint, "errcode", 16,
        uint, "reserved", 1,
        uint, "is_write", 1,
        uint, "no_eof_block", 1,
        uint, "is_be", 1,
        int, "compress_level", 9,
        uint, "last_block_eof", 1,
        uint, "is_compressed", 1,
        uint, "is_gzip", 1));

    // Reserved bits should be written as 0; read as "don't care"

    int cache_size;
    int block_length;
    int block_clength;
    int block_offset;           // low 16 bits of the value returned by bgzf_tell()
    long block_address;         // shifted left by 16 in the value returned by bgzf_tell()
    long uncompressed_address;
    void* uncompressed_block;
    void* compressed_block;
    bgzf_cache_t* cache;
    hFILE* fp;                  // actual file handle
    bgzf_mtaux_t* mt;           // only used for multi-threading
    bgzidx_t* idx;              // BGZF index
    int idx_build_otf;          // build index on the fly, set by bgzf_index_build_init()
    z_stream_s* gz_stream;      // for gzip-compressed files
    long seeked;                // virtual offset of last seek
}

/******************
 * Basic routines *
 ******************/

/**
 * Open an existing file descriptor for reading or writing.
 *
 * @param fd    file descriptor
 *              Note that the file must be opened in binary mode, or else
 *              there will be problems on platforms that make a difference
 *              between text and binary mode.
 * @param mode  mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
 *              writing, 'a' for appending, 'g' for gzip rather than BGZF
 *              compression (with 'w' only), and digit specifies the zlib
 *              compression level.
 *              Note that there is a distinction between 'u' and '0': the
 *              first yields plain uncompressed output whereas the latter
 *              outputs uncompressed data wrapped in the zlib format.
 * @return      BGZF file handler; 0 on error
 */
BGZF* bgzf_dopen(int fd, const(char)* mode);

alias bgzf_fdopen = bgzf_dopen; // for backward compatibility

/**
 * Open the specified file for reading or writing.
 *
 * NOTE(review): _mode_ and the return value are not documented here;
 * presumably they follow bgzf_dopen() -- confirm against the C header.
 */
BGZF* bgzf_open(const(char)* path, const(char)* mode);

/**
 * Open an existing hFILE stream for reading or writing.
 *
 * NOTE(review): _mode_ and the return value are not documented here;
 * presumably they follow bgzf_dopen() -- confirm against the C header.
 */
BGZF* bgzf_hopen(hFILE* fp, const(char)* mode);

/**
 * Close the BGZF and free all associated resources.
 *
 * @param fp    BGZF file handler
 * @return      0 on success and -1 on error
 */
int bgzf_close(BGZF* fp);

/**
 * Read up to _length_ bytes from the file storing into _data_.
 *
 * @param fp     BGZF file handler
 * @param data   data array to read into
 * @param length size of data to read
 * @return       number of bytes actually read; 0 on end-of-file and -1 on error
 */
ssize_t bgzf_read(BGZF* fp, void* data, size_t length);

/**
 * Write _length_ bytes from _data_ to the file.  If no I/O errors occur,
 * the complete _length_ bytes will be written (or queued for writing).
 *
 * @param fp     BGZF file handler
 * @param data   data array to write
 * @param length size of data to write
 * @return       number of bytes written (i.e., _length_); negative on error
 */
ssize_t bgzf_write(BGZF* fp, const(void)* data, size_t length);

/**
 * Write _length_ bytes from _data_ to the file, the index will be used to
 * decide the amount of uncompressed data to be written to each bgzip block.
 * If no I/O errors occur, the complete _length_ bytes will be written (or
 * queued for writing).
 *
 * @param fp     BGZF file handler
 * @param data   data array to write
 * @param length size of data to write
 * @return       number of bytes written (i.e., _length_); negative on error
 */
ssize_t bgzf_block_write(BGZF* fp, const(void)* data, size_t length);

/**
 * Returns the next byte in the file without consuming it.
 *
 * @param fp     BGZF file handler
 * @return       -1 on EOF,
 *               -2 on error,
 *               otherwise the unsigned byte value.
 */
int bgzf_peek(BGZF* fp);

/**
 * Read up to _length_ bytes directly from the underlying stream without
 * decompressing.  Bypasses BGZF blocking, so must be used with care in
 * specialised circumstances only.
 *
 * @param fp     BGZF file handler
 * @param data   data array to read into
 * @param length number of raw bytes to read
 * @return       number of bytes actually read; 0 on end-of-file and -1 on error
 */
ssize_t bgzf_raw_read(BGZF* fp, void* data, size_t length);

/**
 * Write _length_ bytes directly to the underlying stream without
 * compressing.  Bypasses BGZF blocking, so must be used with care
 * in specialised circumstances only.
 *
 * @param fp     BGZF file handler
 * @param data   data array to write
 * @param length number of raw bytes to write
 * @return       number of bytes actually written; -1 on error
 */
ssize_t bgzf_raw_write(BGZF* fp, const(void)* data, size_t length);

/**
 * Write the data in the buffer to the file.
 *
 * @param fp     BGZF file handle
 * @return       0 on success and -1 on error
 */
int bgzf_flush(BGZF* fp);

/**
 * Return a virtual file pointer to the current location in the file.
 * No interpretation of the value should be made, other than a subsequent
 * call to bgzf_seek can be used to position the file at the same point.
 * Return value is non-negative on success.
 *
 * This is a D template with D linkage (not one of the extern (C) symbols).
 *
 * @param fp     a BGZF, a BGZF*, or any other value exposing
 *               `block_address` and `block_offset` members
 * @return       `(block_address << 16) | (block_offset & 0xFFFF)`
 */
pragma(inline, true)
extern (D) auto bgzf_tell(T)(auto ref T fp)
{
    // Block address goes above bit 16; the low 16 bits carry the offset.
    return (fp.block_address << 16) | (fp.block_offset & 0xFFFF);
}

/**
 * Set the file to read from the location specified by _pos_.
 *
 * @param fp     BGZF file handler
 * @param pos    virtual file offset returned by bgzf_tell()
 * @param whence must be SEEK_SET
 * @return       0 on success and -1 on error
 *
 * @note It is not permitted to seek on files open for writing,
 * or files compressed with gzip (as opposed to bgzip).
 */
long bgzf_seek(BGZF* fp, long pos, int whence);

/**
 * Check if the BGZF end-of-file (EOF) marker is present
 *
 * @param fp    BGZF file handler opened for reading
 * @return      1 if the EOF marker is present and correct;
 *              2 if it can't be checked, e.g., because fp isn't seekable;
 *              0 if the EOF marker is absent;
 *              -1 (with errno set) on error
 */
int bgzf_check_EOF(BGZF* fp);

/** Return the file's compression format
 *
 * @param fp  BGZF file handle
 * @return    A small integer matching the corresponding
 *            `enum htsCompression` value:
 *   - 0 / `no_compression` if the file is uncompressed
 *   - 1 / `gzip` if the file is plain GZIP-compressed
 *   - 2 / `bgzf` if the file is BGZF-compressed
 * @since 1.4
 */
int bgzf_compression(BGZF* fp);

/**
 * Check if a file is in the BGZF format
 *
 * @param fn    file name
 * @return      1 if _fn_ is BGZF; 0 if not or on I/O error
 */
int bgzf_is_bgzf(const(char)* fn);

/*********************
 * Advanced routines *
 *********************/

/**
 * Set the cache size. Only effective when compiled with -DBGZF_CACHE.
 *
 * @param fp    BGZF file handler
 * @param size  size of cache in bytes; 0 to disable caching (default)
 */
void bgzf_set_cache_size(BGZF* fp, int size);

/**
 * Flush the file if the remaining buffer size is smaller than _size_
 *
 * @return      0 if flushing succeeded or was not needed; negative on error
 */
int bgzf_flush_try(BGZF* fp, ssize_t size);

/**
 * Read one byte from a BGZF file. It is faster than bgzf_read()
 *
 * @param fp     BGZF file handler
 * @return       byte read; -1 on end-of-file or error
 */
int bgzf_getc(BGZF* fp);

/**
 * Read one line from a BGZF file. It is faster than bgzf_getc()
 *
 * @param fp     BGZF file handler
 * @param delim  delimiter
 * @param str    string to write to; must be initialized
 * @return       length of the string; -1 on end-of-file; <= -2 on error
 */
int bgzf_getline(BGZF* fp, int delim, kstring_t* str);

/**
 * Read the next BGZF block.
 *
 * NOTE(review): the return value is not documented here; confirm its meaning
 * against the C header before relying on it.
 */
int bgzf_read_block(BGZF* fp);

/**
 * Enable multi-threading (when compiled with -DBGZF_MT) via a shared
 * thread pool.  This means both encoder and decoder can balance
 * usage across a single pool of worker jobs.
 *
 * @param fp     BGZF file handler; must be opened for writing
 * @param pool   The thread pool (see hts_create_threads)
 * @param qsize  NOTE(review): not documented here; presumably the size of
 *               the job queue -- confirm against the thread pool API
 * @return       NOTE(review): not documented here; presumably 0 on success
 *               and negative on error -- confirm against the C header
 */
int bgzf_thread_pool(BGZF* fp, hts_tpool* pool, int qsize);

/**
 * Enable multi-threading (only effective when the library was compiled
 * with -DBGZF_MT)
 *
 * @param fp          BGZF file handler; must be opened for writing
 * @param n_threads   #threads used for writing
 * @param n_sub_blks  #blocks processed by each thread; a value 64-256 is recommended
 * @return            NOTE(review): not documented here -- confirm against
 *                    the C header
 */
int bgzf_mt(BGZF* fp, int n_threads, int n_sub_blks);

/**
 * Compress a single BGZF block.
 *
 * @param dst    output buffer (must have size >= BGZF_MAX_BLOCK_SIZE)
 * @param dlen   size of output buffer; updated on return to the number
 *               of bytes actually written to dst
 * @param src    buffer to be compressed
 * @param slen   size of data to compress (must be <= BGZF_BLOCK_SIZE)
 * @param level  compression level
 * @return       0 on success and negative on error
 */
int bgzf_compress(
    void* dst,
    size_t* dlen,
    const(void)* src,
    size_t slen,
    int level);

/*******************
 * bgzidx routines *
 *******************/

/**
 * Position BGZF at the uncompressed offset
 *
 * @param fp           BGZF file handler; must be opened for reading
 * @param uoffset      file offset in the uncompressed data
 * @param where        must be SEEK_SET
 *
 * Returns 0 on success and -1 on error.
 *
 * @note It is not permitted to seek on files open for writing,
 * or files compressed with gzip (as opposed to bgzip).
 */
int bgzf_useek(BGZF* fp, off_t uoffset, int where);

/**
 * Position in uncompressed BGZF
 *
 * @param fp           BGZF file handler; must be opened for reading
 *
 * Returns the current offset on success and -1 on error.
 */
off_t bgzf_utell(BGZF* fp);

/**
 * Tell BGZF to build index while compressing.
 *
 * @param fp          BGZF file handler; can be opened for reading or writing.
 *
 * Returns 0 on success and -1 on error.
 *
 * @note This function must be called before any data has been read or
 * written, and in particular before calling bgzf_mt() on the same
 * file handle (as threads may start reading data before the index
 * has been set up).
 */
int bgzf_index_build_init(BGZF* fp);

/// Load BGZF index
/**
 * @param fp          BGZF file handler
 * @param bname       base name
 * @param suffix      suffix to add to bname (can be NULL)
 * @return 0 on success and -1 on error.
 */
int bgzf_index_load(BGZF* fp, const(char)* bname, const(char)* suffix);

/// Load BGZF index from an hFILE
/**
 * @param fp   BGZF file handle
 * @param idx  hFILE to read from
 * @param name file name (for error reporting only; can be NULL)
 * @return 0 on success and -1 on error.
 *
 * Populates @p fp with index data read from the hFILE handle @p idx.
 * The file pointer to @p idx should point to the start of the index
 * data when this function is called.
 *
 * The file name can optionally be passed in the @p name parameter.  This
 * is only used for printing error messages; if NULL the word "index" is
 * used instead.
 */
int bgzf_index_load_hfile(BGZF* fp, hFILE* idx, const(char)* name);

/// Save BGZF index
/**
 * @param fp          BGZF file handler
 * @param bname       base name
 * @param suffix      suffix to add to bname (can be NULL)
 * @return 0 on success and -1 on error.
 */
int bgzf_index_dump(BGZF* fp, const(char)* bname, const(char)* suffix);

/// Write a BGZF index to an hFILE
/**
 * @param fp     BGZF file handle
 * @param idx    hFILE to write to
 * @param name   file name (for error reporting only, can be NULL)
 * @return 0 on success and -1 on error.
 *
 * Write index data from @p fp to the file @p idx.
 *
 * The file name can optionally be passed in the @p name parameter.  This
 * is only used for printing error messages; if NULL the word "index" is
 * used instead.
 */
int bgzf_index_dump_hfile(BGZF* fp, hFILE* idx, const(char)* name);