use crate::ffi::{FileFormat, check_fmt}; use base::libc::{O_RDONLY, O_TRUNC, O_WRONLY}; use base::{Chunker, LoggedResult, Utf8CStr, WriteExt, error, log_err}; use bytemuck::bytes_of_mut; use bzip2::{Compression as BzCompression, write::BzDecoder, write::BzEncoder}; use flate2::{Compression as GzCompression, write::GzEncoder, write::MultiGzDecoder}; use lz4::{ BlockMode, BlockSize, ContentChecksum, Encoder as LZ4FrameEncoder, EncoderBuilder as LZ4FrameEncoderBuilder, block::CompressionMode, liblz4::BlockChecksum, }; use std::cell::Cell; use std::fs::File; use std::io::{BufWriter, Read, Stdin, Stdout, Write, stdin, stdout}; use std::mem::ManuallyDrop; use std::num::NonZeroU64; use std::ops::DerefMut; use std::os::fd::{AsRawFd, FromRawFd, RawFd}; use xz2::{ stream::{Check as LzmaCheck, Filters as LzmaFilters, LzmaOptions, Stream as LzmaStream}, write::{XzDecoder, XzEncoder}, }; use zopfli::{BlockType, GzipEncoder as ZopFliEncoder, Options as ZopfliOptions}; pub trait WriteFinish: Write { fn finish(self: Box) -> std::io::Result; } // Boilerplate for existing types macro_rules! finish_impl { ($($t:ty),*) => {$( impl WriteFinish for $t { fn finish(self: Box) -> std::io::Result { Self::finish(*self) } } )*} } finish_impl!(GzEncoder, MultiGzDecoder, BzEncoder, XzEncoder); macro_rules! finish_impl_ref { ($($t:ty),*) => {$( impl WriteFinish for $t { fn finish(mut self: Box) -> std::io::Result { Self::finish(self.as_mut()) } } )*} } finish_impl_ref!(BzDecoder, XzDecoder); impl WriteFinish for BufWriter> { fn finish(self: Box) -> std::io::Result { let inner = self.into_inner()?; ZopFliEncoder::finish(inner) } } impl WriteFinish for LZ4FrameEncoder { fn finish(self: Box) -> std::io::Result { let (w, r) = Self::finish(*self); r?; Ok(w) } } // Adapt Reader to Writer // In case some decoders don't support the Write trait, instead of pushing data into the // decoder, we have no choice but to pull data out of it. So first, we create a "fake" reader // that does not own any data as a placeholder. In the Writer adapter struct, when data // is fed in, we call FakeReader::set_data to forward this data as the "source" of the // decoder. Next, we pull data out of the decoder, and finally, forward the decoded data to output. struct FakeReader(Cell<&'static [u8]>); impl FakeReader { fn new() -> FakeReader { FakeReader(Cell::new(&[])) } // SAFETY: the lifetime of the buffer is between the invocation of // this method and the invocation of FakeReader::clear. There is currently // no way to represent this with Rust's lifetime marker, so we transmute all // lifetimes away and make the users of this struct manually manage the lifetime. // It is the responsibility of the caller to ensure the underlying reference does not // live longer than it should. unsafe fn set_data(&self, data: &[u8]) { let buf: &'static [u8] = unsafe { std::mem::transmute(data) }; self.0.set(buf) } fn clear(&self) { self.0.set(&[]) } } impl Read for FakeReader { fn read(&mut self, buf: &mut [u8]) -> std::io::Result { let data = self.0.get(); let len = std::cmp::min(buf.len(), data.len()); buf[..len].copy_from_slice(&data[..len]); self.0.set(&data[len..]); Ok(len) } } // LZ4FrameDecoder struct LZ4FrameDecoder { write: W, decoder: lz4::Decoder, } impl LZ4FrameDecoder { fn new(write: W) -> Self { let fake = FakeReader::new(); let decoder = lz4::Decoder::new(fake).unwrap(); LZ4FrameDecoder { write, decoder } } } impl Write for LZ4FrameDecoder { fn write(&mut self, buf: &[u8]) -> std::io::Result { self.write_all(buf)?; Ok(buf.len()) } fn flush(&mut self) -> std::io::Result<()> { Ok(()) } fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> { // SAFETY: buf is removed from the reader immediately after usage unsafe { self.decoder.reader().set_data(buf) }; std::io::copy(&mut self.decoder, &mut self.write)?; self.decoder.reader().clear(); Ok(()) } } impl WriteFinish for LZ4FrameDecoder { fn finish(self: Box) -> std::io::Result { let (_, r) = self.decoder.finish(); r?; Ok(self.write) } } // LZ4BlockArchive format // // len: | 4 | 4 | n | ... | 4 | // data: | magic | compressed block size | compressed block data | ... | total uncompressed size | // LZ4BlockEncoder const LZ4_BLOCK_SIZE: usize = 0x800000; const LZ4HC_CLEVEL_MAX: i32 = 12; const LZ4_MAGIC: &[u8] = b"\x02\x21\x4c\x18"; struct LZ4BlockEncoder { write: W, chunker: Chunker, out_buf: Box<[u8]>, total: u32, is_lg: bool, } impl LZ4BlockEncoder { fn new(write: W, is_lg: bool) -> Self { let out_sz = lz4::block::compress_bound(LZ4_BLOCK_SIZE).unwrap_or(LZ4_BLOCK_SIZE); LZ4BlockEncoder { write, chunker: Chunker::new(LZ4_BLOCK_SIZE), // SAFETY: all bytes will be initialized before it is used out_buf: unsafe { Box::new_uninit_slice(out_sz).assume_init() }, total: 0, is_lg, } } fn encode_block(write: &mut W, out_buf: &mut [u8], chunk: &[u8]) -> std::io::Result<()> { let compressed_size = lz4::block::compress_to_buffer( chunk, Some(CompressionMode::HIGHCOMPRESSION(LZ4HC_CLEVEL_MAX)), false, out_buf, )?; let block_size = compressed_size as u32; write.write_pod(&block_size)?; write.write_all(&out_buf[..compressed_size]) } } impl Write for LZ4BlockEncoder { fn write(&mut self, buf: &[u8]) -> std::io::Result { self.write_all(buf)?; Ok(buf.len()) } fn flush(&mut self) -> std::io::Result<()> { Ok(()) } fn write_all(&mut self, mut buf: &[u8]) -> std::io::Result<()> { if self.total == 0 { // Write header self.write.write_all(LZ4_MAGIC)?; } self.total += buf.len() as u32; while !buf.is_empty() { let (b, chunk) = self.chunker.add_data(buf); buf = b; if let Some(chunk) = chunk { Self::encode_block(&mut self.write, &mut self.out_buf, chunk)?; } } Ok(()) } } impl WriteFinish for LZ4BlockEncoder { fn finish(mut self: Box) -> std::io::Result { let chunk = self.chunker.get_available(); if !chunk.is_empty() { Self::encode_block(&mut self.write, &mut self.out_buf, chunk)?; } if self.is_lg { self.write.write_pod(&self.total)?; } Ok(self.write) } } // LZ4BlockDecoder struct LZ4BlockDecoder { write: W, chunker: Chunker, out_buf: Box<[u8]>, curr_block_size: usize, } impl LZ4BlockDecoder { fn new(write: W) -> Self { LZ4BlockDecoder { write, chunker: Chunker::new(size_of::()), // SAFETY: all bytes will be initialized before it is used out_buf: unsafe { Box::new_uninit_slice(LZ4_BLOCK_SIZE).assume_init() }, curr_block_size: 0, } } fn decode_block(write: &mut W, out_buf: &mut [u8], chunk: &[u8]) -> std::io::Result<()> { let decompressed_size = lz4::block::decompress_to_buffer(chunk, Some(LZ4_BLOCK_SIZE as i32), out_buf)?; write.write_all(&out_buf[..decompressed_size]) } } impl Write for LZ4BlockDecoder { fn write(&mut self, buf: &[u8]) -> std::io::Result { self.write_all(buf)?; Ok(buf.len()) } fn flush(&mut self) -> std::io::Result<()> { Ok(()) } fn write_all(&mut self, mut buf: &[u8]) -> std::io::Result<()> { while !buf.is_empty() { let (b, chunk) = self.chunker.add_data(buf); buf = b; if let Some(chunk) = chunk { if chunk == LZ4_MAGIC { // Skip magic, read next u32 continue; } if self.curr_block_size == 0 { // We haven't got the current block size yet, try read it let mut next_u32: u32 = 0; bytes_of_mut(&mut next_u32).copy_from_slice(chunk); if next_u32 > lz4::block::compress_bound(LZ4_BLOCK_SIZE)? as u32 { // This is the LG format trailer, EOF continue; } // Update chunker to read next block self.curr_block_size = next_u32 as usize; self.chunker.set_chunk_size(self.curr_block_size); continue; } // Actually decode the block Self::decode_block(&mut self.write, &mut self.out_buf, chunk)?; // Reset for the next block self.curr_block_size = 0; self.chunker.set_chunk_size(size_of::()); } } Ok(()) } } impl WriteFinish for LZ4BlockDecoder { fn finish(mut self: Box) -> std::io::Result { let chunk = self.chunker.get_available(); if !chunk.is_empty() { return Err(std::io::Error::new( std::io::ErrorKind::Interrupted, "Finish ran before end of compressed stream", )); } Ok(self.write) } } // Top-level APIs pub fn get_encoder<'a, W: Write + 'a>(format: FileFormat, w: W) -> Box + 'a> { match format { FileFormat::XZ => { let opt = LzmaOptions::new_preset(9).unwrap(); let stream = LzmaStream::new_stream_encoder(LzmaFilters::new().lzma2(&opt), LzmaCheck::Crc32) .unwrap(); Box::new(XzEncoder::new_stream(w, stream)) } FileFormat::LZMA => { let opt = LzmaOptions::new_preset(9).unwrap(); let stream = LzmaStream::new_lzma_encoder(&opt).unwrap(); Box::new(XzEncoder::new_stream(w, stream)) } FileFormat::BZIP2 => Box::new(BzEncoder::new(w, BzCompression::best())), FileFormat::LZ4 => { let encoder = LZ4FrameEncoderBuilder::new() .block_size(BlockSize::Max4MB) .block_mode(BlockMode::Independent) .checksum(ContentChecksum::ChecksumEnabled) .block_checksum(BlockChecksum::BlockChecksumEnabled) .level(9) .auto_flush(true) .build(w) .unwrap(); Box::new(encoder) } FileFormat::LZ4_LEGACY => Box::new(LZ4BlockEncoder::new(w, false)), FileFormat::LZ4_LG => Box::new(LZ4BlockEncoder::new(w, true)), FileFormat::ZOPFLI => { // These options are already better than gzip -9 let opt = ZopfliOptions { iteration_count: NonZeroU64::new(1).unwrap(), maximum_block_splits: 1, ..Default::default() }; Box::new(ZopFliEncoder::new_buffered(opt, BlockType::Dynamic, w).unwrap()) } FileFormat::GZIP => Box::new(GzEncoder::new(w, GzCompression::best())), _ => unreachable!(), } } pub fn get_decoder<'a, W: Write + 'a>(format: FileFormat, w: W) -> Box + 'a> { match format { FileFormat::XZ | FileFormat::LZMA => { let stream = LzmaStream::new_auto_decoder(u64::MAX, 0).unwrap(); Box::new(XzDecoder::new_stream(w, stream)) } FileFormat::BZIP2 => Box::new(BzDecoder::new(w)), FileFormat::LZ4 => Box::new(LZ4FrameDecoder::new(w)), FileFormat::LZ4_LG | FileFormat::LZ4_LEGACY => Box::new(LZ4BlockDecoder::new(w)), FileFormat::ZOPFLI | FileFormat::GZIP => Box::new(MultiGzDecoder::new(w)), _ => unreachable!(), } } // C++ FFI pub fn compress_fd(format: FileFormat, in_fd: RawFd, out_fd: RawFd) { let mut in_file = unsafe { ManuallyDrop::new(File::from_raw_fd(in_fd)) }; let mut out_file = unsafe { ManuallyDrop::new(File::from_raw_fd(out_fd)) }; let mut encoder = get_encoder(format, out_file.deref_mut()); let _: LoggedResult<()> = try { std::io::copy(in_file.deref_mut(), encoder.as_mut())?; encoder.finish()?; }; } pub fn decompress_bytes_fd(format: FileFormat, in_bytes: &[u8], in_fd: RawFd, out_fd: RawFd) { let mut in_file = unsafe { ManuallyDrop::new(File::from_raw_fd(in_fd)) }; let mut out_file = unsafe { ManuallyDrop::new(File::from_raw_fd(out_fd)) }; let mut decoder = get_decoder(format, out_file.deref_mut()); let _: LoggedResult<()> = try { decoder.write_all(in_bytes)?; std::io::copy(in_file.deref_mut(), decoder.as_mut())?; decoder.finish()?; }; } pub fn compress_bytes(format: FileFormat, in_bytes: &[u8], out_fd: RawFd) { let mut out_file = unsafe { ManuallyDrop::new(File::from_raw_fd(out_fd)) }; let mut encoder = get_encoder(format, out_file.deref_mut()); let _: LoggedResult<()> = try { encoder.write_all(in_bytes)?; encoder.finish()?; }; } pub fn decompress_bytes(format: FileFormat, in_bytes: &[u8], out_fd: RawFd) { let mut out_file = unsafe { ManuallyDrop::new(File::from_raw_fd(out_fd)) }; let mut decoder = get_decoder(format, out_file.deref_mut()); let _: LoggedResult<()> = try { decoder.write_all(in_bytes)?; decoder.finish()?; }; } enum AsRawFdFile { Stdin(Stdin), Stdout(Stdout), File(File), } impl AsRawFd for AsRawFdFile { fn as_raw_fd(&self) -> RawFd { match self { AsRawFdFile::Stdin(stdin) => stdin.as_raw_fd(), AsRawFdFile::Stdout(stdout) => stdout.as_raw_fd(), AsRawFdFile::File(file) => file.as_raw_fd(), } } } pub(crate) fn decompress(infile: &mut String, outfile: Option<&mut String>) -> LoggedResult<()> { let in_std = infile == "-"; let mut rm_in = false; let raw_in = if in_std { AsRawFdFile::Stdin(stdin()) } else { AsRawFdFile::File(Utf8CStr::from_string(infile).open(O_RDONLY)?) }; let mut buf = [0u8; 4096]; let mut in_file = unsafe { File::from_raw_fd(raw_in.as_raw_fd()) }; let _ = in_file.read(&mut buf)?; let format = check_fmt(&buf); eprintln!("Detected format: {format}"); if !format.is_compressed() { return Err(log_err!("Input file is not a supported type!")); } let raw_out = if let Some(outfile) = outfile { if outfile == "-" { AsRawFdFile::Stdout(stdout()) } else { AsRawFdFile::File(Utf8CStr::from_string(outfile).create(O_WRONLY | O_TRUNC, 0o644)?) } } else if in_std { AsRawFdFile::Stdout(stdout()) } else { // strip the extension rm_in = true; let mut outfile = if let Some((outfile, ext)) = infile.rsplit_once('.') { if ext != format.ext() { Err(log_err!("Input file is not a supported type!"))?; } outfile.to_owned() } else { infile.clone() }; eprintln!("Decompressing to [{outfile}]"); AsRawFdFile::File(Utf8CStr::from_string(&mut outfile).create(O_WRONLY | O_TRUNC, 0o644)?) }; decompress_bytes_fd(format, &buf, raw_in.as_raw_fd(), raw_out.as_raw_fd()); if rm_in { Utf8CStr::from_string(infile).remove()?; } Ok(()) } pub(crate) fn compress( method: FileFormat, infile: &mut String, outfile: Option<&mut String>, ) -> LoggedResult<()> { if method == FileFormat::UNKNOWN { error!("Unsupported compression format"); } let in_std = infile == "-"; let mut rm_in = false; let raw_in = if in_std { AsRawFdFile::Stdin(stdin()) } else { AsRawFdFile::File(Utf8CStr::from_string(infile).open(O_RDONLY)?) }; let raw_out = if let Some(outfile) = outfile { if outfile == "-" { AsRawFdFile::Stdout(stdout()) } else { AsRawFdFile::File(Utf8CStr::from_string(outfile).create(O_WRONLY | O_TRUNC, 0o644)?) } } else if in_std { AsRawFdFile::Stdout(stdout()) } else { let mut outfile = format!("{infile}.{}", method.ext()); eprintln!("Compressing to [{outfile}]"); rm_in = true; AsRawFdFile::File(Utf8CStr::from_string(&mut outfile).create(O_WRONLY | O_TRUNC, 0o644)?) }; compress_fd(method, raw_in.as_raw_fd(), raw_out.as_raw_fd()); if rm_in { Utf8CStr::from_string(infile).remove()?; } Ok(()) }