From 8e4bf87f2ea38fff3961b2708a01b53ce010a338 Mon Sep 17 00:00:00 2001 From: Fred Robinson Date: Tue, 17 Mar 2020 21:40:40 -0700 Subject: [PATCH 01/14] add ReaderAt --- reader_at.go | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 reader_at.go diff --git a/reader_at.go b/reader_at.go new file mode 100644 index 0000000..250c7ea --- /dev/null +++ b/reader_at.go @@ -0,0 +1,7 @@ +package xz + +// ReaderAtConfig defines the parameters for the xz readerat. +type ReaderAtConfig struct { + DictCap int + SingleStream bool +} From 67736ad8722f333ecb8c19217ceb81867c04e748 Mon Sep 17 00:00:00 2001 From: Fred Robinson Date: Sun, 22 Mar 2020 23:00:26 -0700 Subject: [PATCH 02/14] building skeleton with one test --- .gitignore | 1 + reader_at.go | 82 +++++++++++++++++++++++++++++++++++++++++++++-- reader_at_test.go | 31 ++++++++++++++++++ 3 files changed, 111 insertions(+), 3 deletions(-) create mode 100644 reader_at_test.go diff --git a/.gitignore b/.gitignore index e3c2fc2..d4db283 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ cmd/xb/xb # default compression test file enwik8* +/fox diff --git a/reader_at.go b/reader_at.go index 250c7ea..2bee031 100644 --- a/reader_at.go +++ b/reader_at.go @@ -1,7 +1,83 @@ package xz +import ( + "io" + "sync" +) + // ReaderAtConfig defines the parameters for the xz readerat. -type ReaderAtConfig struct { - DictCap int - SingleStream bool +type ReaderAtConfig struct{} + +// Verify checks the reader config for validity. Zero values will be replaced by +// default values. +func (c *ReaderAtConfig) Verify() error { + // if c == nil { + // return errors.New("xz: reader parameters are nil") + // } + return nil +} + +// ReaderAt supports the reading of one or multiple xz streams. +type ReaderAt struct { + conf ReaderAtConfig + + xz io.ReaderAt +} + +// NewReader creates a new xz reader using the default parameters. +// The function reads and checks the header of the first XZ stream. The +// reader will process multiple streams including padding. +func NewReaderAt(xz io.ReaderAt) (r *ReaderAt, err error) { + return ReaderAtConfig{}.NewReaderAt(xz) +} + +// NewReaderAt creates an xz stream reader. +func (c ReaderAtConfig) NewReaderAt(xz io.ReaderAt) (*ReaderAt, error) { + if err := c.Verify(); err != nil { + return nil, err + } + + r := &ReaderAt{ + conf: c, + xz: xz, + } + + if err := r.init(); err != nil { + return nil, err + } + + return r, nil + +} + +func (r *ReaderAt) init() error { + return nil +} + +func (r *ReaderAt) ReadAt(p []byte, offset int64) (int, error) { + return 1, io.EOF +} + +// rat wraps a ReaderAt to fulfill the io.Reader interface. +type rat struct { + *sync.Mutex + offset int64 + reader io.ReaderAt +} + +func (r *rat) Read(p []byte) (int, error) { + r.Lock() + defer r.Unlock() + + n, err := r.reader.ReadAt(p, r.offset) + r.offset += int64(n) + return n, err +} + +func newRat(ra io.ReaderAt, offset int64) *rat { + return &rat{ + Mutex: &sync.Mutex{}, + offset: offset, + reader: ra, + } } diff --git a/reader_at_test.go b/reader_at_test.go new file mode 100644 index 0000000..af57d23 --- /dev/null +++ b/reader_at_test.go @@ -0,0 +1,31 @@ +package xz + +import ( + "bytes" + "io" + "os" + "testing" +) + +func TestReaderAtSimple(t *testing.T) { + const file = "fox.xz" + xz, err := os.Open(file) + if err != nil { + t.Fatalf("os.Open(%q) error %s", file, err) + } + r, err := NewReaderAt(xz) + if err != nil { + t.Fatalf("NewReader error %s", err) + } + var buf bytes.Buffer + reader := newRat(r, 0) + if _, err = io.Copy(&buf, reader); err != nil { + t.Fatalf("io.Copy error %s", err) + } + + bufStr := buf.String() + expected := "The qubasdf" // fixme + if bufStr != expected { + t.Fatalf("Unexpected decompression output. \"%s\" != \"%s\"", bufStr, expected) + } +} From dc0fb91dd38bd1773d6baa43cc27e7417b209ced Mon Sep 17 00:00:00 2001 From: Fred Robinson Date: Mon, 23 Mar 2020 21:38:49 -0700 Subject: [PATCH 03/14] factor out readFooter function --- reader.go | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/reader.go b/reader.go index 22cd6d5..64c4524 100644 --- a/reader.go +++ b/reader.go @@ -188,15 +188,8 @@ func (r *streamReader) readTail() error { } } - p := make([]byte, footerLen) - if _, err = io.ReadFull(r.xz, p); err != nil { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - return err - } - var f footer - if err = f.UnmarshalBinary(p); err != nil { + f, err := readFooter(r.xz) + if err != nil { return err } xlog.Debugf("xz footer %s", f) @@ -209,6 +202,22 @@ func (r *streamReader) readTail() error { return nil } +func readFooter(r io.Reader) (*footer, error) { + p := make([]byte, footerLen) + if _, err := io.ReadFull(r, p); err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + return nil, err + } + var f footer + if err := f.UnmarshalBinary(p); err != nil { + return nil, err + } + + return &f, nil +} + // Read reads actual data from the xz stream. func (r *streamReader) Read(p []byte) (n int, err error) { for n < len(p) { From 03f313085586cc98027ab83a4902d44dc0b5b51d Mon Sep 17 00:00:00 2001 From: Fred Robinson Date: Mon, 23 Mar 2020 22:29:34 -0700 Subject: [PATCH 04/14] start reading index --- format.go | 7 +++---- reader_at.go | 27 ++++++++++++++++++++++++++- reader_at_test.go | 11 ++++++++++- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/format.go b/format.go index edfec9a..45b7f25 100644 --- a/format.go +++ b/format.go @@ -193,7 +193,7 @@ func (f footer) String() string { // Minimum and maximum for the size of the index (backward size). const ( minIndexSize = 4 - maxIndexSize = (1 << 32) * 4 + maxIndexSize = 1 << 32 * 4 ) // MarshalBinary converts footer values into an xz file footer. Note @@ -213,7 +213,7 @@ func (f *footer) MarshalBinary() (data []byte, err error) { data = make([]byte, footerLen) // backward size (index size) - s := (f.indexSize / 4) - 1 + s := f.indexSize/4 - 1 putUint32LE(data[4:], uint32(s)) // flags data[9] = f.flags @@ -228,8 +228,7 @@ func (f *footer) MarshalBinary() (data []byte, err error) { return data, nil } -// UnmarshalBinary sets the footer value by unmarshalling an xz file -// footer. +// UnmarshalBinary sets the footer value by unmarshalling an xz file footer. func (f *footer) UnmarshalBinary(data []byte) error { if len(data) != footerLen { return errors.New("xz: wrong footer length") diff --git a/reader_at.go b/reader_at.go index 2bee031..4e8646a 100644 --- a/reader_at.go +++ b/reader_at.go @@ -2,11 +2,14 @@ package xz import ( "io" + "log" "sync" ) // ReaderAtConfig defines the parameters for the xz readerat. -type ReaderAtConfig struct{} +type ReaderAtConfig struct { + Len int64 +} // Verify checks the reader config for validity. Zero values will be replaced by // default values. @@ -21,6 +24,8 @@ func (c *ReaderAtConfig) Verify() error { type ReaderAt struct { conf ReaderAtConfig + len int64 + xz io.ReaderAt } @@ -51,6 +56,26 @@ func (c ReaderAtConfig) NewReaderAt(xz io.ReaderAt) (*ReaderAt, error) { } func (r *ReaderAt) init() error { + r.len = r.conf.Len + if r.len < 1 { + panic("todo: implement probing for Len") + } + + footerOffset := r.len - footerLen + f, err := readFooter(newRat(r.xz, footerOffset)) + if err != nil { + return err + } + + indexOffset := footerOffset - f.indexSize + indexOffset++ // readIndexBody assumes the indicator byte has already been read + index, _, err := readIndexBody(newRat(r.xz, indexOffset)) + if err != nil { + return err + } + + log.Fatal(index) + return nil } diff --git a/reader_at_test.go b/reader_at_test.go index af57d23..13c3471 100644 --- a/reader_at_test.go +++ b/reader_at_test.go @@ -13,7 +13,16 @@ func TestReaderAtSimple(t *testing.T) { if err != nil { t.Fatalf("os.Open(%q) error %s", file, err) } - r, err := NewReaderAt(xz) + + info, err := os.Stat(file) + if err != nil { + t.Fatalf("os.Stat(%q) error %s", file, err) + } + + conf := ReaderAtConfig{ + Len: info.Size(), + } + r, err := conf.NewReaderAt(xz) if err != nil { t.Fatalf("NewReader error %s", err) } From 7782d18b3532aa70659e4a45f64bd19ac729b3e2 Mon Sep 17 00:00:00 2001 From: Fred Robinson Date: Mon, 23 Mar 2020 23:19:28 -0700 Subject: [PATCH 05/14] reads an index --- reader_at.go | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/reader_at.go b/reader_at.go index 4e8646a..cc32bd5 100644 --- a/reader_at.go +++ b/reader_at.go @@ -24,7 +24,9 @@ func (c *ReaderAtConfig) Verify() error { type ReaderAt struct { conf ReaderAtConfig - len int64 + // len of the contents of the underlying xz data + len int64 + indices []index xz io.ReaderAt } @@ -43,8 +45,10 @@ func (c ReaderAtConfig) NewReaderAt(xz io.ReaderAt) (*ReaderAt, error) { } r := &ReaderAt{ - conf: c, - xz: xz, + conf: c, + len: 0, + indices: []index{}, + xz: xz, } if err := r.init(); err != nil { @@ -55,6 +59,25 @@ func (c ReaderAtConfig) NewReaderAt(xz io.ReaderAt) (*ReaderAt, error) { } +type index struct { + startOffset int64 + rs []record +} + +func (i index) compressedBufferedSize() int64 { + size := int64(0) + for _, r := range i.rs { + unpadded := r.unpaddedSize + padded := 4 * (unpadded / 4) + if unpadded < padded { + padded += 4 + } + + size += padded + } + return size +} + func (r *ReaderAt) init() error { r.len = r.conf.Len if r.len < 1 { @@ -69,17 +92,22 @@ func (r *ReaderAt) init() error { indexOffset := footerOffset - f.indexSize indexOffset++ // readIndexBody assumes the indicator byte has already been read - index, _, err := readIndexBody(newRat(r.xz, indexOffset)) + indexRecs, _, err := readIndexBody(newRat(r.xz, indexOffset)) if err != nil { return err } - log.Fatal(index) + ix := index{ + rs: indexRecs, + } + ix.startOffset = indexOffset - ix.compressedBufferedSize() + r.indices = append(r.indices, ix) return nil } func (r *ReaderAt) ReadAt(p []byte, offset int64) (int, error) { + log.Fatal(r) return 1, io.EOF } From 6a023ec476aaf2233938334a3be423bd41d20fdf Mon Sep 17 00:00:00 2001 From: frederickrobinson Date: Sun, 5 Apr 2020 17:28:57 -0700 Subject: [PATCH 06/14] simple read test passes --- format.go | 35 ++++++++++--- format_test.go | 4 +- reader.go | 20 ++------ reader_at.go | 126 +++++++++++++++++++++++++++++++++++++--------- reader_at_test.go | 2 +- writer.go | 4 +- 6 files changed, 140 insertions(+), 51 deletions(-) diff --git a/format.go b/format.go index 45b7f25..9986dab 100644 --- a/format.go +++ b/format.go @@ -101,8 +101,8 @@ func newHashFunc(flags byte) (newHash func() hash.Hash, err error) { return } -// header provides the actual content of the xz file header: the flags. -type header struct { +// streamHeader provides the actual content of the xz stream: the flags. +type streamHeader struct { flags byte } @@ -112,18 +112,36 @@ var errHeaderMagic = errors.New("xz: invalid header magic bytes") // ValidHeader checks whether data is a correct xz file header. The // length of data must be HeaderLen. func ValidHeader(data []byte) bool { - var h header + var h streamHeader err := h.UnmarshalBinary(data) return err == nil } // String returns a string representation of the flags. -func (h header) String() string { +func (h streamHeader) String() string { return flagString(h.flags) } +func (h *streamHeader) UnmarshalReader(xz io.Reader) error { + data := make([]byte, HeaderLen) + if _, err := io.ReadFull(xz, data[:4]); err != nil { + return err + } + if bytes.Equal(data[:4], []byte{0, 0, 0, 0}) { + return errPadding + } + if _, err := io.ReadFull(xz, data[4:]); err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + return err + } + + return h.UnmarshalBinary(data) +} + // UnmarshalBinary reads header from the provided data slice. -func (h *header) UnmarshalBinary(data []byte) error { +func (h *streamHeader) UnmarshalBinary(data []byte) error { // header length if len(data) != HeaderLen { return errors.New("xz: wrong file header length") @@ -155,7 +173,7 @@ func (h *header) UnmarshalBinary(data []byte) error { } // MarshalBinary generates the xz file header. -func (h *header) MarshalBinary() (data []byte, err error) { +func (h *streamHeader) MarshalBinary() (data []byte, err error) { if err = verifyFlags(h.flags); err != nil { return nil, err } @@ -626,6 +644,11 @@ func (rec *record) MarshalBinary() (data []byte, err error) { return p[:n], nil } +// paddedLen returns the padded length of the compressed record. +func (rec *record) paddedLen() int64 { + return int64(padLen(rec.unpaddedSize)) + rec.unpaddedSize +} + // writeIndex writes the index, a sequence of records. func writeIndex(w io.Writer, index []record) (n int64, err error) { crc := crc32.NewIEEE() diff --git a/format_test.go b/format_test.go index 0b875d3..a336a80 100644 --- a/format_test.go +++ b/format_test.go @@ -10,12 +10,12 @@ import ( ) func TestHeader(t *testing.T) { - h := header{flags: CRC32} + h := streamHeader{flags: CRC32} data, err := h.MarshalBinary() if err != nil { t.Fatalf("MarshalBinary error %s", err) } - var g header + var g streamHeader if err = g.UnmarshalBinary(data); err != nil { t.Fatalf("UnmarshalBinary error %s", err) } diff --git a/reader.go b/reader.go index 64c4524..53a2125 100644 --- a/reader.go +++ b/reader.go @@ -61,7 +61,7 @@ type streamReader struct { xz io.Reader br *blockReader newHash func() hash.Hash - h header + h streamHeader index []record } @@ -137,27 +137,17 @@ func (c ReaderConfig) newStreamReader(xz io.Reader) (r *streamReader, err error) if err = c.Verify(); err != nil { return nil, err } - data := make([]byte, HeaderLen) - if _, err := io.ReadFull(xz, data[:4]); err != nil { - return nil, err - } - if bytes.Equal(data[:4], []byte{0, 0, 0, 0}) { - return nil, errPadding - } - if _, err = io.ReadFull(xz, data[4:]); err != nil { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - return nil, err - } + r = &streamReader{ ReaderConfig: c, xz: xz, index: make([]record, 0, 4), } - if err = r.h.UnmarshalBinary(data); err != nil { + + if err := r.h.UnmarshalReader(xz); err != nil { return nil, err } + xlog.Debugf("xz header %s", r.h) if r.newHash, err = newHashFunc(r.h.flags); err != nil { return nil, err diff --git a/reader_at.go b/reader_at.go index cc32bd5..dbe1d6c 100644 --- a/reader_at.go +++ b/reader_at.go @@ -1,9 +1,11 @@ package xz import ( + "fmt" "io" - "log" "sync" + + "github.com/ulikunitz/xz/internal/xlog" ) // ReaderAtConfig defines the parameters for the xz readerat. @@ -24,11 +26,11 @@ func (c *ReaderAtConfig) Verify() error { type ReaderAt struct { conf ReaderAtConfig - // len of the contents of the underlying xz data - len int64 indices []index - xz io.ReaderAt + // len of the contents of the underlying xz data + len int64 + xz io.ReaderAt } // NewReader creates a new xz reader using the default parameters. @@ -51,7 +53,7 @@ func (c ReaderAtConfig) NewReaderAt(xz io.ReaderAt) (*ReaderAt, error) { xz: xz, } - if err := r.init(); err != nil { + if err := r.setup(); err != nil { return nil, err } @@ -59,26 +61,23 @@ func (c ReaderAtConfig) NewReaderAt(xz io.ReaderAt) (*ReaderAt, error) { } +// An index carries all the information necessary for reading randomly into a +// single stream. type index struct { - startOffset int64 - rs []record + blockStartOffset int64 + streamHeader streamHeader + records []record } func (i index) compressedBufferedSize() int64 { size := int64(0) - for _, r := range i.rs { - unpadded := r.unpaddedSize - padded := 4 * (unpadded / 4) - if unpadded < padded { - padded += 4 - } - - size += padded + for _, r := range i.records { + size += r.paddedLen() } return size } -func (r *ReaderAt) init() error { +func (r *ReaderAt) setup() error { r.len = r.conf.Len if r.len < 1 { panic("todo: implement probing for Len") @@ -90,25 +89,102 @@ func (r *ReaderAt) init() error { return err } - indexOffset := footerOffset - f.indexSize - indexOffset++ // readIndexBody assumes the indicator byte has already been read - indexRecs, _, err := readIndexBody(newRat(r.xz, indexOffset)) + indexStartOffset := footerOffset - f.indexSize + + // readIndexBody assumes the indicator byte has already been read + indexRecs, _, err := readIndexBody(newRat(r.xz, indexStartOffset+1)) if err != nil { return err } ix := index{ - rs: indexRecs, + records: indexRecs, } - ix.startOffset = indexOffset - ix.compressedBufferedSize() - r.indices = append(r.indices, ix) + ix.blockStartOffset = indexStartOffset - ix.compressedBufferedSize() + r.indices = append([]index{ix}, r.indices...) + + sh := streamHeader{} + headerStartOffset := ix.blockStartOffset - HeaderLen + err = sh.UnmarshalReader(newRat(r.xz, headerStartOffset)) + if err != nil { + return fmt.Errorf("trouble reading stream header at offset %d: %v", headerStartOffset, err) + } + ix.streamHeader = sh + + xlog.Debugf("xz indices %+v", r.indices) return nil } -func (r *ReaderAt) ReadAt(p []byte, offset int64) (int, error) { - log.Fatal(r) - return 1, io.EOF +func (r *ReaderAt) ReadAt(p []byte, bufferPos int64) (int, error) { + lenRequested := len(p) + + indicesPos := int64(0) + + for _, index := range r.indices { + blockOffset := index.blockStartOffset + + for _, block := range index.records { + if indicesPos <= bufferPos && bufferPos <= indicesPos+block.uncompressedSize { + blockStartPos := bufferPos - indicesPos + blockEndPos := blockStartPos + int64(len(p)) + if blockEndPos > block.uncompressedSize { + blockEndPos = block.uncompressedSize + } + blockAmtToRead := blockEndPos - blockStartPos + + r.readBlockAt( + p[:blockAmtToRead], blockStartPos, + blockOffset, block.unpaddedSize, index.streamHeader.flags) + p = p[blockAmtToRead:] + bufferPos += blockAmtToRead + } + + blockOffset += block.paddedLen() + indicesPos += block.uncompressedSize + } + } + + var err error + if len(p) != 0 { + err = io.EOF + } + return lenRequested - len(p), err +} + +func (r *ReaderAt) readBlockAt( + p []byte, bufferPos int64, + blockOffset, blockLen int64, streamFlags byte, +) error { + viewStart := rat{ + Mutex: &sync.Mutex{}, + offset: blockOffset, + reader: r.xz, + } + + view := io.LimitReader(&viewStart, blockLen) + + blockHeader, hlen, err := readBlockHeader(view) + if err != nil { + return err + } + + readerConfig := ReaderConfig{} + + hashFn, err := newHashFunc(streamFlags) + if err != nil { + return err + } + blockReader, err := readerConfig.newBlockReader(view, blockHeader, hlen, hashFn()) + + trash := make([]byte, bufferPos) + _, err = io.ReadFull(blockReader, trash) + if err != nil { + return err + } + + _, err = io.ReadFull(blockReader, p) + return err } // rat wraps a ReaderAt to fulfill the io.Reader interface. diff --git a/reader_at_test.go b/reader_at_test.go index 13c3471..916710a 100644 --- a/reader_at_test.go +++ b/reader_at_test.go @@ -33,7 +33,7 @@ func TestReaderAtSimple(t *testing.T) { } bufStr := buf.String() - expected := "The qubasdf" // fixme + expected := "The quick brown fox jumps over the lazy dog.\n" // fixme if bufStr != expected { t.Fatalf("Unexpected decompression output. \"%s\" != \"%s\"", bufStr, expected) } diff --git a/writer.go b/writer.go index aec10df..241d560 100644 --- a/writer.go +++ b/writer.go @@ -141,7 +141,7 @@ type Writer struct { xz io.Writer bw *blockWriter newHash func() hash.Hash - h header + h streamHeader index []record closed bool } @@ -183,7 +183,7 @@ func (c WriterConfig) NewWriter(xz io.Writer) (w *Writer, err error) { w = &Writer{ WriterConfig: c, xz: xz, - h: header{c.CheckSum}, + h: streamHeader{c.CheckSum}, index: make([]record, 0, 4), } if w.newHash, err = newHashFunc(c.CheckSum); err != nil { From 7222079d72f125452e57b496fe039c2a5e0c8751 Mon Sep 17 00:00:00 2001 From: Frederick Robinson Date: Sun, 5 Apr 2020 20:08:51 -0700 Subject: [PATCH 07/14] move test files --- reader_at_test.go | 6 +++++- reader_test.go | 8 ++++---- fox-check-none.xz => testfiles/fox-check-none.xz | Bin fox.xz => testfiles/fox.xz | Bin 4 files changed, 9 insertions(+), 5 deletions(-) rename fox-check-none.xz => testfiles/fox-check-none.xz (100%) rename fox.xz => testfiles/fox.xz (100%) diff --git a/reader_at_test.go b/reader_at_test.go index 916710a..be32bdb 100644 --- a/reader_at_test.go +++ b/reader_at_test.go @@ -8,7 +8,11 @@ import ( ) func TestReaderAtSimple(t *testing.T) { - const file = "fox.xz" + testFile(t, "testfiles/fox.xz") + testFile(t, "testfiles/fox-check-none.xz") +} + +func testFile(t *testing.T, file string) { xz, err := os.Open(file) if err != nil { t.Fatalf("os.Open(%q) error %s", file, err) diff --git a/reader_test.go b/reader_test.go index 45e725b..d1d28f7 100644 --- a/reader_test.go +++ b/reader_test.go @@ -13,7 +13,7 @@ import ( ) func TestReaderSimple(t *testing.T) { - const file = "fox.xz" + const file = "testfiles/fox.xz" xz, err := os.Open(file) if err != nil { t.Fatalf("os.Open(%q) error %s", file, err) @@ -29,7 +29,7 @@ func TestReaderSimple(t *testing.T) { } func TestReaderSingleStream(t *testing.T) { - data, err := ioutil.ReadFile("fox.xz") + data, err := ioutil.ReadFile("testfiles/fox.xz") if err != nil { t.Fatalf("ReadFile error %s", err) } @@ -56,7 +56,7 @@ func TestReaderSingleStream(t *testing.T) { } func TestReaderMultipleStreams(t *testing.T) { - data, err := ioutil.ReadFile("fox.xz") + data, err := ioutil.ReadFile("testfiles/fox.xz") if err != nil { t.Fatalf("ReadFile error %s", err) } @@ -81,7 +81,7 @@ func TestReaderMultipleStreams(t *testing.T) { } func TestCheckNone(t *testing.T) { - const file = "fox-check-none.xz" + const file = "testfiles/fox-check-none.xz" xz, err := os.Open(file) if err != nil { t.Fatalf("os.Open(%q) error %s", file, err) diff --git a/fox-check-none.xz b/testfiles/fox-check-none.xz similarity index 100% rename from fox-check-none.xz rename to testfiles/fox-check-none.xz diff --git a/fox.xz b/testfiles/fox.xz similarity index 100% rename from fox.xz rename to testfiles/fox.xz From db43713aaf766ca7ac9f2cfb70441db55b3a530d Mon Sep 17 00:00:00 2001 From: Frederick Robinson Date: Sun, 5 Apr 2020 20:22:32 -0700 Subject: [PATCH 08/14] add chunked test file --- reader_at_test.go | 13 +++++++------ testfiles/fox.blocks.xz | Bin 0 -> 336 bytes 2 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 testfiles/fox.blocks.xz diff --git a/reader_at_test.go b/reader_at_test.go index be32bdb..f67dd1d 100644 --- a/reader_at_test.go +++ b/reader_at_test.go @@ -10,17 +10,18 @@ import ( func TestReaderAtSimple(t *testing.T) { testFile(t, "testfiles/fox.xz") testFile(t, "testfiles/fox-check-none.xz") + testFile(t, "testfiles/fox.blocks.xz") } -func testFile(t *testing.T, file string) { - xz, err := os.Open(file) +func testFile(t *testing.T, filePath string) { + xz, err := os.Open(filePath) if err != nil { - t.Fatalf("os.Open(%q) error %s", file, err) + t.Fatalf("os.Open(%q) error %s", filePath, err) } - info, err := os.Stat(file) + info, err := os.Stat(filePath) if err != nil { - t.Fatalf("os.Stat(%q) error %s", file, err) + t.Fatalf("os.Stat(%q) error %s", filePath, err) } conf := ReaderAtConfig{ @@ -37,7 +38,7 @@ func testFile(t *testing.T, file string) { } bufStr := buf.String() - expected := "The quick brown fox jumps over the lazy dog.\n" // fixme + expected := "The quick brown fox jumps over the lazy dog.\n" if bufStr != expected { t.Fatalf("Unexpected decompression output. \"%s\" != \"%s\"", bufStr, expected) } diff --git a/testfiles/fox.blocks.xz b/testfiles/fox.blocks.xz new file mode 100644 index 0000000000000000000000000000000000000000..f40cf8aba6f63207c1ddcf521d06b1d2cd98b997 GIT binary patch literal 336 zcmexsUKJ6=z`*kC+7>q^21Q0O1_p)_{ill=8CXIxQWXk;T!!QM*)w1BYMeonFU?HO zRshMLbWq~Hu%43{O+Kk8zdR2luWxq!?z@Bu;b`&-Y55fjAbI;*$ta~2GLO;Zvr2Od zib3-1yU&R_oAw2u$t&cSr51tYJ9qfmDQ=0oi7sCP^Z-a+?e&Jc326pBX!1FURh3}* yK9`H1zg}{7Mw3s;PuJrD>0kEs)H1Fc|JO5c%Ce%vAa1TI&M!&NSV0O|BBKBdKvSXs literal 0 HcmV?d00001 From ab374ddfe3e9645f8f530fd955ea5e7d9cfbf6e8 Mon Sep 17 00:00:00 2001 From: Frederick Robinson Date: Sun, 5 Apr 2020 21:38:07 -0700 Subject: [PATCH 09/14] handle trailing nulls --- reader_at.go | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/reader_at.go b/reader_at.go index dbe1d6c..f8ef269 100644 --- a/reader_at.go +++ b/reader_at.go @@ -83,7 +83,26 @@ func (r *ReaderAt) setup() error { panic("todo: implement probing for Len") } - footerOffset := r.len - footerLen + // read backwards past potential null bytes until we find the end of the + // footer + end := r.len - 1 + for end > 0 { + probe := make([]byte, 1) + n, err := r.xz.ReadAt(probe, end) + if err != nil { + return err + } + if n != len(probe) { + return fmt.Errorf("read %d bytes", n) + } + if probe[0] != 0 { + break + } + end-- + } + end++ + + footerOffset := end - footerLen f, err := readFooter(newRat(r.xz, footerOffset)) if err != nil { return err From ef84e9be714e3a5d76e748563630b9271a7b0d7d Mon Sep 17 00:00:00 2001 From: Frederick Robinson Date: Sun, 5 Apr 2020 21:38:32 -0700 Subject: [PATCH 10/14] add new failing test for multiple streams --- reader_at_test.go | 80 ++++++++++++++++++++++++++++++++++++++--------- reader_test.go | 26 +++++++++------ 2 files changed, 82 insertions(+), 24 deletions(-) diff --git a/reader_at_test.go b/reader_at_test.go index f67dd1d..a1e82e7 100644 --- a/reader_at_test.go +++ b/reader_at_test.go @@ -2,18 +2,53 @@ package xz import ( "bytes" - "io" + "io/ioutil" "os" "testing" ) +const expected = "The quick brown fox jumps over the lazy dog.\n" + +func TestReaderAtBlocks(t *testing.T) { + testFile(t, "testfiles/fox.blocks.xz", expected) +} + func TestReaderAtSimple(t *testing.T) { - testFile(t, "testfiles/fox.xz") - testFile(t, "testfiles/fox-check-none.xz") - testFile(t, "testfiles/fox.blocks.xz") + testFile(t, "testfiles/fox.xz", expected) +} + +func TestReaderAtMS(t *testing.T) { + expect := expected + expected + expected + expected + filePath := "testfiles/fox.blocks.xz" + + f, _ := testOpenFile(t, filePath) + fData, err := ioutil.ReadAll(f) + if err != nil { + t.Fatalf("Error reading file %s", err) + } + msBytes := testMultiStreams(fData) + msB := bytes.NewReader(msBytes) + + conf := ReaderAtConfig{ + Len: int64(len(msBytes)), + } + r, err := conf.NewReaderAt(msB) + if err != nil { + t.Fatalf("NewReaderAt error %s", err) + } + + reader := newRat(r, 0) + decompressedBytes, err := ioutil.ReadAll(reader) + if err != nil { + t.Fatalf("io.Copy error %s", err) + } + + if string(decompressedBytes) != expect { + t.Fatalf("Unexpected decompression output for reader %+v. \"%s\" != \"%s\"", r, string(decompressedBytes), expect) + } } -func testFile(t *testing.T, filePath string) { +func testOpenFile(t *testing.T, filePath string) (*os.File, int64) { xz, err := os.Open(filePath) if err != nil { t.Fatalf("os.Open(%q) error %s", filePath, err) @@ -24,22 +59,39 @@ func testFile(t *testing.T, filePath string) { t.Fatalf("os.Stat(%q) error %s", filePath, err) } + return xz, info.Size() +} + +func testFile(t *testing.T, filePath string, expected string) { + for i := 0; i < len(expected); i++ { + for n := 1; n+i < len(expected); n++ { + testFilePart(t, filePath, expected, i, n) + } + } +} + +func testFilePart(t *testing.T, filePath string, expected string, start, size int) { + f, fileSize := testOpenFile(t, filePath) + conf := ReaderAtConfig{ - Len: info.Size(), + Len: fileSize, } - r, err := conf.NewReaderAt(xz) + r, err := conf.NewReaderAt(f) if err != nil { t.Fatalf("NewReader error %s", err) } - var buf bytes.Buffer - reader := newRat(r, 0) - if _, err = io.Copy(&buf, reader); err != nil { + + decompressedBytes := make([]byte, size) + n, err := r.ReadAt(decompressedBytes, int64(start)) + if n != len(decompressedBytes) { + t.Fatalf("unexpectedly didn't read all") + } + if err != nil { t.Fatalf("io.Copy error %s", err) } - bufStr := buf.String() - expected := "The quick brown fox jumps over the lazy dog.\n" - if bufStr != expected { - t.Fatalf("Unexpected decompression output. \"%s\" != \"%s\"", bufStr, expected) + subsetExpected := expected[start : start+size] + if string(decompressedBytes) != subsetExpected { + t.Fatalf("Unexpected decompression output. \"%s\" != \"%s\"", string(decompressedBytes), subsetExpected) } } diff --git a/reader_test.go b/reader_test.go index d1d28f7..8db8c11 100644 --- a/reader_test.go +++ b/reader_test.go @@ -60,16 +60,9 @@ func TestReaderMultipleStreams(t *testing.T) { if err != nil { t.Fatalf("ReadFile error %s", err) } - m := make([]byte, 0, 4*len(data)+4*4) - m = append(m, data...) - m = append(m, data...) - m = append(m, 0, 0, 0, 0) - m = append(m, data...) - m = append(m, 0, 0, 0, 0) - m = append(m, 0, 0, 0, 0) - m = append(m, data...) - m = append(m, 0, 0, 0, 0) - xz := bytes.NewReader(m) + + multiStream := testMultiStreams(data) + xz := bytes.NewReader(multiStream) r, err := NewReader(xz) if err != nil { t.Fatalf("NewReader error %s", err) @@ -80,6 +73,19 @@ func TestReaderMultipleStreams(t *testing.T) { } } +func testMultiStreams(singleStream []byte) []byte { + multiStream := make([]byte, 0, 4*len(singleStream)+4*4) + multiStream = append(multiStream, singleStream...) + multiStream = append(multiStream, singleStream...) + multiStream = append(multiStream, 0, 0, 0, 0) + multiStream = append(multiStream, singleStream...) + multiStream = append(multiStream, 0, 0, 0, 0) + multiStream = append(multiStream, 0, 0, 0, 0) + multiStream = append(multiStream, singleStream...) + multiStream = append(multiStream, 0, 0, 0, 0) + return multiStream +} + func TestCheckNone(t *testing.T) { const file = "testfiles/fox-check-none.xz" xz, err := os.Open(file) From 58793278b51bd8e5767a25431b7988b5115287e8 Mon Sep 17 00:00:00 2001 From: Frederick Robinson Date: Mon, 6 Apr 2020 17:47:28 -0700 Subject: [PATCH 11/14] handle multi-sentence case --- reader_at.go | 52 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/reader_at.go b/reader_at.go index f8ef269..c0f3ea0 100644 --- a/reader_at.go +++ b/reader_at.go @@ -53,12 +53,25 @@ func (c ReaderAtConfig) NewReaderAt(xz io.ReaderAt) (*ReaderAt, error) { xz: xz, } - if err := r.setup(); err != nil { - return nil, err + r.len = r.conf.Len + if r.len < 1 { + panic("todo: implement probing for Len") } - return r, nil + streamEnd := r.len - 1 + + for streamEnd > 0 { + streamStart, err := r.setupIndexAt(streamEnd) + if err != nil { + return nil, fmt.Errorf("trouble creating indices: %v", err) + } + + // the end of the next stream reading backwards is one before the start + // of the one we just processed. + streamEnd = streamStart - 1 + } + return r, nil } // An index carries all the information necessary for reading randomly into a @@ -77,35 +90,32 @@ func (i index) compressedBufferedSize() int64 { return size } -func (r *ReaderAt) setup() error { - r.len = r.conf.Len - if r.len < 1 { - panic("todo: implement probing for Len") - } - +// setupIndexAt takes the offset of the end of a stream, or null bytes following +// the end of a stream. It builds an index for that stream, adds it to the +// beginning of the ReaderAt and returns the offset to the beginning of the stream. +func (r *ReaderAt) setupIndexAt(endOffset int64) (int64, error) { // read backwards past potential null bytes until we find the end of the // footer - end := r.len - 1 - for end > 0 { + for endOffset > 0 { probe := make([]byte, 1) - n, err := r.xz.ReadAt(probe, end) + n, err := r.xz.ReadAt(probe, endOffset) if err != nil { - return err + return 0, err } if n != len(probe) { - return fmt.Errorf("read %d bytes", n) + return 0, fmt.Errorf("read %d bytes", n) } if probe[0] != 0 { break } - end-- + endOffset-- } - end++ + endOffset++ - footerOffset := end - footerLen + footerOffset := endOffset - footerLen f, err := readFooter(newRat(r.xz, footerOffset)) if err != nil { - return err + return 0, err } indexStartOffset := footerOffset - f.indexSize @@ -113,7 +123,7 @@ func (r *ReaderAt) setup() error { // readIndexBody assumes the indicator byte has already been read indexRecs, _, err := readIndexBody(newRat(r.xz, indexStartOffset+1)) if err != nil { - return err + return 0, err } ix := index{ @@ -126,13 +136,13 @@ func (r *ReaderAt) setup() error { headerStartOffset := ix.blockStartOffset - HeaderLen err = sh.UnmarshalReader(newRat(r.xz, headerStartOffset)) if err != nil { - return fmt.Errorf("trouble reading stream header at offset %d: %v", headerStartOffset, err) + return 0, fmt.Errorf("trouble reading stream header at offset %d: %v", headerStartOffset, err) } ix.streamHeader = sh xlog.Debugf("xz indices %+v", r.indices) - return nil + return headerStartOffset, nil } func (r *ReaderAt) ReadAt(p []byte, bufferPos int64) (int, error) { From 97881a478803e6b7620a742dbab800729a3f79c4 Mon Sep 17 00:00:00 2001 From: Frederick Robinson Date: Mon, 6 Apr 2020 17:47:43 -0700 Subject: [PATCH 12/14] refactor test --- reader_at_test.go | 52 +++++++++++++++-------------------------------- 1 file changed, 16 insertions(+), 36 deletions(-) diff --git a/reader_at_test.go b/reader_at_test.go index a1e82e7..1bc5f75 100644 --- a/reader_at_test.go +++ b/reader_at_test.go @@ -2,23 +2,27 @@ package xz import ( "bytes" + "io" "io/ioutil" "os" "testing" ) -const expected = "The quick brown fox jumps over the lazy dog.\n" +const foxSentenceConst = "The quick brown fox jumps over the lazy dog.\n" func TestReaderAtBlocks(t *testing.T) { - testFile(t, "testfiles/fox.blocks.xz", expected) + f, fileSize := testOpenFile(t, "testfiles/fox.blocks.xz") + testFilePart(t, f, fileSize, foxSentenceConst, 0, len(foxSentenceConst)) } func TestReaderAtSimple(t *testing.T) { - testFile(t, "testfiles/fox.xz", expected) + f, fileSize := testOpenFile(t, "testfiles/fox.xz") + testFilePart(t, f, fileSize, foxSentenceConst, 0, 10) } func TestReaderAtMS(t *testing.T) { - expect := expected + expected + expected + expected + expect := foxSentenceConst + foxSentenceConst + foxSentenceConst + foxSentenceConst + filePath := "testfiles/fox.blocks.xz" f, _ := testOpenFile(t, filePath) @@ -29,23 +33,8 @@ func TestReaderAtMS(t *testing.T) { msBytes := testMultiStreams(fData) msB := bytes.NewReader(msBytes) - conf := ReaderAtConfig{ - Len: int64(len(msBytes)), - } - r, err := conf.NewReaderAt(msB) - if err != nil { - t.Fatalf("NewReaderAt error %s", err) - } - - reader := newRat(r, 0) - decompressedBytes, err := ioutil.ReadAll(reader) - if err != nil { - t.Fatalf("io.Copy error %s", err) - } - - if string(decompressedBytes) != expect { - t.Fatalf("Unexpected decompression output for reader %+v. \"%s\" != \"%s\"", r, string(decompressedBytes), expect) - } + start := len(foxSentenceConst) + testFilePart(t, msB, int64(len(msBytes)), expect, start, len(expect)-start) } func testOpenFile(t *testing.T, filePath string) (*os.File, int64) { @@ -62,17 +51,7 @@ func testOpenFile(t *testing.T, filePath string) (*os.File, int64) { return xz, info.Size() } -func testFile(t *testing.T, filePath string, expected string) { - for i := 0; i < len(expected); i++ { - for n := 1; n+i < len(expected); n++ { - testFilePart(t, filePath, expected, i, n) - } - } -} - -func testFilePart(t *testing.T, filePath string, expected string, start, size int) { - f, fileSize := testOpenFile(t, filePath) - +func testFilePart(t *testing.T, f io.ReaderAt, fileSize int64, expected string, start, size int) { conf := ReaderAtConfig{ Len: fileSize, } @@ -83,15 +62,16 @@ func testFilePart(t *testing.T, filePath string, expected string, start, size in decompressedBytes := make([]byte, size) n, err := r.ReadAt(decompressedBytes, int64(start)) + if err != nil { + t.Fatalf("error while reading at: %v", err) + } if n != len(decompressedBytes) { t.Fatalf("unexpectedly didn't read all") } - if err != nil { - t.Fatalf("io.Copy error %s", err) - } subsetExpected := expected[start : start+size] if string(decompressedBytes) != subsetExpected { - t.Fatalf("Unexpected decompression output. \"%s\" != \"%s\"", string(decompressedBytes), subsetExpected) + t.Fatalf("Unexpected decompression output. \"%s\" != \"%s\"", + string(decompressedBytes), subsetExpected) } } From 3d0594991ec202bfa36b9cfd906be6b710d6f48b Mon Sep 17 00:00:00 2001 From: Frederick Robinson Date: Mon, 6 Apr 2020 22:23:16 -0700 Subject: [PATCH 13/14] add benchmark --- reader_at_test.go | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/reader_at_test.go b/reader_at_test.go index 1bc5f75..e2f3193 100644 --- a/reader_at_test.go +++ b/reader_at_test.go @@ -15,6 +15,15 @@ func TestReaderAtBlocks(t *testing.T) { testFilePart(t, f, fileSize, foxSentenceConst, 0, len(foxSentenceConst)) } +func BenchmarkBlocks(b *testing.B) { + f, fileSize := testOpenFile(b, "testfiles/fox.blocks.xz") + + b.ResetTimer() + for i := 0; i < b.N; i++ { + testFilePart(b, f, fileSize, foxSentenceConst, 0, len(foxSentenceConst)) + } +} + func TestReaderAtSimple(t *testing.T) { f, fileSize := testOpenFile(t, "testfiles/fox.xz") testFilePart(t, f, fileSize, foxSentenceConst, 0, 10) @@ -37,7 +46,7 @@ func TestReaderAtMS(t *testing.T) { testFilePart(t, msB, int64(len(msBytes)), expect, start, len(expect)-start) } -func testOpenFile(t *testing.T, filePath string) (*os.File, int64) { +func testOpenFile(t testing.TB, filePath string) (*os.File, int64) { xz, err := os.Open(filePath) if err != nil { t.Fatalf("os.Open(%q) error %s", filePath, err) @@ -51,7 +60,7 @@ func testOpenFile(t *testing.T, filePath string) (*os.File, int64) { return xz, info.Size() } -func testFilePart(t *testing.T, f io.ReaderAt, fileSize int64, expected string, start, size int) { +func testFilePart(t testing.TB, f io.ReaderAt, fileSize int64, expected string, start, size int) { conf := ReaderAtConfig{ Len: fileSize, } From 927ef4b012bf29a5fca05077d8bcd2c94b3e6aec Mon Sep 17 00:00:00 2001 From: frederickrobinson Date: Sat, 11 Apr 2020 14:44:54 -0700 Subject: [PATCH 14/14] add Size() method to access decompressed size --- reader_at.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/reader_at.go b/reader_at.go index c0f3ea0..25a86a8 100644 --- a/reader_at.go +++ b/reader_at.go @@ -90,6 +90,14 @@ func (i index) compressedBufferedSize() int64 { return size } +func (i index) uncompressedSize() int64 { + size := int64(0) + for _, r := range i.records { + size += r.uncompressedSize + } + return size +} + // setupIndexAt takes the offset of the end of a stream, or null bytes following // the end of a stream. It builds an index for that stream, adds it to the // beginning of the ReaderAt and returns the offset to the beginning of the stream. @@ -145,6 +153,14 @@ func (r *ReaderAt) setupIndexAt(endOffset int64) (int64, error) { return headerStartOffset, nil } +func (r *ReaderAt) Size() int64 { + total := int64(0) + for _, ix := range r.indices { + total += ix.uncompressedSize() + } + return total +} + func (r *ReaderAt) ReadAt(p []byte, bufferPos int64) (int, error) { lenRequested := len(p)