| /* compress.c - deflate/inflate code for zip, gzip, zlib, and raw |
| * |
| * Copyright 2014 Rob Landley <rob@landley.net> |
| * |
| * The inflate/deflate code lives here, so the various things that use it |
| * either live here or call these commands to pipe data through them. |
| * |
| * Divergence from posix: replace obsolete/patented "compress" with mutiplexer. |
| * (gzip already replaces "uncompress".) |
| * |
| * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip) |
| * LSB 4.1 has gzip, gunzip, and zcat |
| * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout |
| |
| // Accept many different kinds of command line argument. |
| // Leave Lrg at end so flag values line up. |
| |
| USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN)) |
| USE_GZIP(NEWTOY(gzip, USE_GZIP_D("d")"19dcflqStvgLRz[!gLRz]", TOYFLAG_USR|TOYFLAG_BIN)) |
| USE_ZCAT(NEWTOY(zcat, 0, TOYFLAG_USR|TOYFLAG_BIN)) |
| USE_GUNZIP(NEWTOY(gunzip, "cflqStv", TOYFLAG_USR|TOYFLAG_BIN)) |
| |
| //zip unzip gzip gunzip zcat |
| |
| config COMPRESS |
| bool "compress" |
| default n |
| help |
| usage: compress [-zgLR19] [FILE] |
| |
| Compress or decompress file (or stdin) using "deflate" algorithm. |
| |
| -1 min compression |
| -9 max compression (default) |
| -g gzip (default) |
| -L zlib |
| -R raw |
| -z zip |
| |
| config GZIP |
| bool "gzip" |
| default y |
| depends on COMPRESS |
| help |
| usage: gzip [-19cfqStvzgLR] [FILE...] |
| |
| Compess (deflate) file(s). With no files, compress stdin to stdout. |
| |
| On successful decompression, compressed files are replaced with the |
| uncompressed version. The input file is removed and replaced with |
| a new file without the .gz extension (with same ownership/permissions). |
| |
| -1 Minimal compression (fastest) |
| -9 Max compression (default) |
| -c cat to stdout (act as zcat) |
| -f force (if output file exists, input is tty, unrecognized extension) |
| -q quiet (no warnings) |
| -S specify exension (default .*) |
| -t test compressed file(s) |
| -v verbose (like -l, but compress files) |
| |
| Compression type: |
| -g gzip (default) -L zlib -R raw -z zip |
| |
| config GZIP_D |
| bool |
| default y |
| depends on GZIP && DECOMPRESS |
| help |
| usage: gzip [-d] |
| |
| -d decompress (act as gunzip) |
| |
| config DECOMPRESS |
| bool "decompress" |
| default n |
| help |
| usage: compress [-zglrcd9] [FILE] |
| |
| Compress or decompress file (or stdin) using "deflate" algorithm. |
| |
| -c compress with -g gzip (default) -l zlib -r raw -z zip |
| -d decompress (autodetects type) |
| |
| |
| config ZCAT |
| bool "zcat" |
| default y |
| depends on DECOMPRESS |
| help |
| usage: zcat [FILE...] |
| |
| Decompress deflated file(s) to stdout |
| |
| config GUNZIP |
| bool "gunzip" |
| default y |
| depends on DECOMPRESS |
| help |
| usage: gunzip [-cflqStv] [FILE...] |
| |
| Decompess (deflate) file(s). With no files, compress stdin to stdout. |
| |
| On successful decompression, compressed files are replaced with the |
| uncompressed version. The input file is removed and replaced with |
| a new file without the .gz extension (with same ownership/permissions). |
| |
| -c cat to stdout (act as zcat) |
| -f force (output file exists, input is tty, unrecognized extension) |
| -l list compressed/uncompressed/ratio/name for each input file. |
| -q quiet (no warnings) |
| -S specify exension (default .*) |
| -t test compressed file(s) |
| -v verbose (like -l, but decompress files) |
| */ |
| |
| #define FOR_compress |
| #include "toys.h" |
| |
| GLOBALS( |
| // Huffman codes: base offset and extra bits tables (length and distance) |
| char lenbits[29], distbits[30]; |
| unsigned short lenbase[29], distbase[30]; |
| void *fixdisthuff, *fixlithuff; |
| |
| // CRC |
| void (*crcfunc)(char *data, int len); |
| unsigned crc; |
| |
| // Compressed data buffer |
| char *data; |
| unsigned pos, len; |
| int infd, outfd; |
| |
| // Tables only used for deflation |
| unsigned short *hashhead, *hashchain; |
| ) |
| |
| // little endian bit buffer |
| struct bitbuf { |
| int fd, bitpos, len, max; |
| char buf[]; |
| }; |
| |
| // malloc a struct bitbuf |
| struct bitbuf *bitbuf_init(int fd, int size) |
| { |
| struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size); |
| |
| bb->max = size; |
| bb->fd = fd; |
| |
| return bb; |
| } |
| |
| // Advance bitpos without the overhead of recording bits |
| void bitbuf_skip(struct bitbuf *bb, int bits) |
| { |
| int pos = bb->bitpos + bits, len = bb->len << 3; |
| |
| while (pos >= len) { |
| pos -= len; |
| len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3; |
| if (bb->len < 1) perror_exit("inflate EOF"); |
| } |
| bb->bitpos = pos; |
| } |
| |
| // Optimized single bit inlined version |
| static inline int bitbuf_bit(struct bitbuf *bb) |
| { |
| int bufpos = bb->bitpos>>3; |
| |
| if (bufpos == bb->len) { |
| bitbuf_skip(bb, 0); |
| bufpos = 0; |
| } |
| |
| return (bb->buf[bufpos]>>(bb->bitpos++&7))&1; |
| } |
| |
| // Fetch the next X bits from the bitbuf, little endian |
| unsigned bitbuf_get(struct bitbuf *bb, int bits) |
| { |
| int result = 0, offset = 0; |
| |
| while (bits) { |
| int click = bb->bitpos >> 3, blow, blen; |
| |
| // Load more data if buffer empty |
| if (click == bb->len) bitbuf_skip(bb, click = 0); |
| |
| // grab bits from next byte |
| blow = bb->bitpos & 7; |
| blen = 8-blow; |
| if (blen > bits) blen = bits; |
| result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset; |
| offset += blen; |
| bits -= blen; |
| bb->bitpos += blen; |
| } |
| |
| return result; |
| } |
| |
| void bitbuf_flush(struct bitbuf *bb) |
| { |
| if (!bb->bitpos) return; |
| |
| xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8); |
| memset(bb->buf, 0, bb->max); |
| bb->bitpos = 0; |
| } |
| |
| void bitbuf_put(struct bitbuf *bb, int data, int len) |
| { |
| while (len) { |
| int click = bb->bitpos >> 3, blow, blen; |
| |
| // Flush buffer if necessary |
| if (click == bb->max) { |
| bitbuf_flush(bb); |
| click = 0; |
| } |
| blow = bb->bitpos & 7; |
| blen = 8-blow; |
| if (blen > len) blen = len; |
| bb->buf[click] |= data << blow; |
| bb->bitpos += blen; |
| data >>= blen; |
| len -= blen; |
| } |
| } |
| |
| static void output_byte(char sym) |
| { |
| int pos = TT.pos++ & 32767; |
| |
| TT.data[pos] = sym; |
| |
| if (!pos) { |
| xwrite(TT.outfd, TT.data, 32768); |
| if (TT.crcfunc) TT.crcfunc(TT.data, 32768); |
| } |
| } |
| |
| // Huffman coding uses bits to traverse a binary tree to a leaf node, |
| // By placing frequently occurring symbols at shorter paths, frequently |
| // used symbols may be represented in fewer bits than uncommon symbols. |
| |
| struct huff { |
| unsigned short length[16]; |
| unsigned short symbol[288]; |
| }; |
| |
| // Create simple huffman tree from array of bit lengths. |
| |
| // The symbols in the huffman trees are sorted (first by bit length |
| // of the code to reach them, then by symbol number). This means that given |
| // the bit length of each symbol, we can construct a unique tree. |
| static void len2huff(struct huff *huff, char bitlen[], int len) |
| { |
| int offset[16]; |
| int i; |
| |
| // Count number of codes at each bit length |
| memset(huff, 0, sizeof(struct huff)); |
| for (i = 0; i<len; i++) huff->length[bitlen[i]]++; |
| |
| // Sort symbols by bit length. (They'll remain sorted by symbol within that.) |
| *huff->length = *offset = 0; |
| for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1]; |
| |
| for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i; |
| } |
| |
| // Fetch and decode next huffman coded symbol from bitbuf. |
| // This takes advantage of the sorting to navigate the tree as an array: |
| // each time we fetch a bit we have all the codes at that bit level in |
| // order with no gaps. |
| static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff) |
| { |
| unsigned short *length = huff->length; |
| int start = 0, offset = 0; |
| |
| // Traverse through the bit lengths until our code is in this range |
| for (;;) { |
| offset = (offset << 1) | bitbuf_bit(bb); |
| start += *++length; |
| if ((offset -= *length) < 0) break; |
| if ((length - huff->length) & 16) error_exit("bad symbol"); |
| } |
| |
| return huff->symbol[start + offset]; |
| } |
| |
| // Decompress deflated data from bitbuf to TT.outfd. |
| static void inflate(struct bitbuf *bb) |
| { |
| TT.crc = ~0; |
| // repeat until spanked |
| for (;;) { |
| int final, type; |
| |
| final = bitbuf_get(bb, 1); |
| type = bitbuf_get(bb, 2); |
| |
| if (type == 3) error_exit("bad type"); |
| |
| // Uncompressed block? |
| if (!type) { |
| int len, nlen; |
| |
| // Align to byte, read length |
| bitbuf_skip(bb, (8-bb->bitpos)&7); |
| len = bitbuf_get(bb, 16); |
| nlen = bitbuf_get(bb, 16); |
| if (len != (0xffff & ~nlen)) error_exit("bad len"); |
| |
| // Dump literal output data |
| while (len) { |
| int pos = bb->bitpos >> 3, bblen = bb->len - pos; |
| char *p = bb->buf+pos; |
| |
| // dump bytes until done or end of current bitbuf contents |
| if (bblen > len) bblen = len; |
| pos = bblen; |
| while (pos--) output_byte(*(p++)); |
| bitbuf_skip(bb, bblen << 3); |
| len -= bblen; |
| } |
| |
| // Compressed block |
| } else { |
| struct huff *disthuff, *lithuff; |
| |
| // Dynamic huffman codes? |
| if (type == 2) { |
| struct huff *h2 = ((struct huff *)toybuf)+1; |
| int i, litlen, distlen, hufflen; |
| char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b" |
| "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits; |
| |
| // The huffman trees are stored as a series of bit lengths |
| litlen = bitbuf_get(bb, 5)+257; // max 288 |
| distlen = bitbuf_get(bb, 5)+1; // max 32 |
| hufflen = bitbuf_get(bb, 4)+4; // max 19 |
| |
| // The literal and distance codes are themselves compressed, in |
| // a complicated way: an array of bit lengths (hufflen many |
| // entries, each 3 bits) is used to fill out an array of 19 entries |
| // in a magic order, leaving the rest 0. Then make a tree out of it: |
| memset(bits = toybuf+1, 0, 19); |
| for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3); |
| len2huff(h2, bits, 19); |
| |
| // Use that tree to read in the literal and distance bit lengths |
| for (i = 0; i < litlen + distlen;) { |
| int sym = huff_and_puff(bb, h2); |
| |
| // 0-15 are literals, 16 = repeat previous code 3-6 times, |
| // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit) |
| if (sym < 16) bits[i++] = sym; |
| else { |
| int len = sym & 2; |
| |
| len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2); |
| memset(bits+i, bits[i-1] * !(sym&3), len); |
| i += len; |
| } |
| } |
| if (i > litlen+distlen) error_exit("bad tree"); |
| |
| len2huff(lithuff = h2, bits, litlen); |
| len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen); |
| |
| // Static huffman codes |
| } else { |
| lithuff = TT.fixlithuff; |
| disthuff = TT.fixdisthuff; |
| } |
| |
| // Use huffman tables to decode block of compressed symbols |
| for (;;) { |
| int sym = huff_and_puff(bb, lithuff); |
| |
| // Literal? |
| if (sym < 256) output_byte(sym); |
| |
| // Copy range? |
| else if (sym > 256) { |
| int len, dist; |
| |
| sym -= 257; |
| len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]); |
| sym = huff_and_puff(bb, disthuff); |
| dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]); |
| sym = TT.pos & 32767; |
| |
| while (len--) output_byte(TT.data[(TT.pos-dist) & 32767]); |
| |
| // End of block |
| } else break; |
| } |
| } |
| |
| // Was that the last block? |
| if (final) break; |
| } |
| |
| if (TT.pos & 32767) { |
| xwrite(TT.outfd, TT.data, TT.pos & 32767); |
| if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767); |
| } |
| } |
| |
| // Deflate from TT.infd to bitbuf |
| // For deflate, TT.len = input read, TT.pos = input consumed |
| static void deflate(struct bitbuf *bb) |
| { |
| char *data = TT.data; |
| int len, final = 0; |
| |
| TT.crc = ~0; |
| |
| while (!final) { |
| // Read next half-window of data if we haven't hit EOF yet. |
| len = readall(TT.infd, data+(TT.len&32768), 32768); |
| if (len < 0) perror_exit("read"); // todo: add filename |
| if (len != 32768) final++; |
| if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len); |
| // TT.len += len; crcfunc advances len |
| |
| // store block as literal |
| bitbuf_put(bb, final, 1); |
| bitbuf_put(bb, 0, 1); |
| |
| bitbuf_put(bb, 0, (8-bb->bitpos)&7); |
| bitbuf_put(bb, len, 16); |
| bitbuf_put(bb, 0xffff & ~len, 16); |
| |
| // repeat until spanked |
| while (TT.pos != TT.len) { |
| unsigned pos = TT.pos & 65535; |
| |
| bitbuf_put(bb, data[pos], 8); |
| |
| // need to refill buffer? |
| if (!(32767 & ++TT.pos) && !final) break; |
| } |
| } |
| bitbuf_flush(bb); |
| } |
| |
| // Allocate memory for deflate/inflate. |
| static void init_deflate(int compress) |
| { |
| int i, n = 1; |
| |
| // compress needs 64k data and 32k each for hashhead and hashchain. |
| // decompress just needs 32k data. |
| TT.data = xmalloc(32768*(compress ? 4 : 1)); |
| if (compress) { |
| TT.hashhead = (unsigned short *)(TT.data + 65536); |
| TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768); |
| } |
| |
| // Calculate lenbits, lenbase, distbits, distbase |
| *TT.lenbase = 3; |
| for (i = 0; i<sizeof(TT.lenbits)-1; i++) { |
| if (i>4) { |
| if (!(i&3)) { |
| TT.lenbits[i]++; |
| n <<= 1; |
| } |
| if (i == 27) n--; |
| else TT.lenbits[i+1] = TT.lenbits[i]; |
| } |
| TT.lenbase[i+1] = n + TT.lenbase[i]; |
| } |
| n = 0; |
| for (i = 0; i<sizeof(TT.distbits); i++) { |
| TT.distbase[i] = 1<<n; |
| if (i) TT.distbase[i] += TT.distbase[i-1]; |
| if (i>3 && !(i&1)) n++; |
| TT.distbits[i] = n; |
| } |
| |
| // Init fixed huffman tables |
| for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279); |
| len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288); |
| memset(toybuf, 5, 30); |
| len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30); |
| } |
| |
| // Return true/false whether we consumed a gzip header. |
| static int is_gzip(struct bitbuf *bb) |
| { |
| int flags; |
| |
| // Confirm signature |
| if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31) |
| return 0; |
| bitbuf_skip(bb, 6*8); |
| |
| // Skip extra, name, comment, header CRC fields |
| if (flags & 4) bitbuf_skip(bb, 16); |
| if (flags & 8) while (bitbuf_get(bb, 8)); |
| if (flags & 16) while (bitbuf_get(bb, 8)); |
| if (flags & 2) bitbuf_skip(bb, 16); |
| |
| return 1; |
| } |
| |
| void gzip_crc(char *data, int len) |
| { |
| int i; |
| unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024); |
| |
| crc = TT.crc; |
| for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8); |
| TT.crc = crc; |
| TT.len += len; |
| } |
| |
| static void do_gzip(int fd, char *name) |
| { |
| struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf)); |
| |
| // Header from RFC 1952 section 2.2: |
| // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none), |
| // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression), |
| // Operating System (FF=unknown) |
| |
| TT.infd = fd; |
| xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10); |
| |
| // Use last 1k of toybuf for little endian crc table |
| crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1); |
| TT.crcfunc = gzip_crc; |
| |
| deflate(bb); |
| |
| // tail: crc32, len32 |
| |
| bitbuf_put(bb, 0, (8-bb->bitpos)&7); |
| bitbuf_put(bb, ~TT.crc, 32); |
| bitbuf_put(bb, TT.len, 32); |
| |
| bitbuf_flush(bb); |
| free(bb); |
| } |
| |
| static void do_zcat(int fd, char *name) |
| { |
| struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf)); |
| |
| if (!is_gzip(bb)) error_exit("not gzip"); |
| TT.outfd = 1; |
| |
| // Use last 1k of toybuf for little endian crc table |
| crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1); |
| TT.crcfunc = gzip_crc; |
| |
| inflate(bb); |
| |
| // tail: crc32, len32 |
| |
| bitbuf_skip(bb, (8-bb->bitpos)&7); |
| if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32)) |
| error_exit("bad crc"); |
| free(bb); |
| } |
| |
| // Parse many different kinds of command line argument: |
| |
| void compress_main(void) |
| { |
| // todo: this |
| printf("hello world"); |
| } |
| |
| //#define CLEANUP_compress |
| //#define FOR_zcat |
| //#include "generated/flags.h" |
| |
| void zcat_main(void) |
| { |
| init_deflate(0); |
| |
| loopfiles(toys.optargs, do_zcat); |
| } |
| |
| void gunzip_main(void) |
| { |
| init_deflate(0); |
| |
| loopfiles(toys.optargs, do_zcat); |
| } |
| |
| void gzip_main(void) |
| { |
| init_deflate(1); |
| |
| loopfiles(toys.optargs, do_gzip); |
| } |