From ab120f0023f84fc585741ab65ea93b3a308e2b51 Mon Sep 17 00:00:00 2001
From: Matthew Holt
Date: Thu, 21 Nov 2024 15:11:04 -0700
Subject: [PATCH] Refactor Archive=>CompressedArchive; implement DeepFS

I don't think an Archive type is necessary, and to my surprise, an
optional embedded interface field (Compression) is always non-nil, but
will panic when used unless it is set. Hence we now have
CompressedArchive again, where Compression is required.

DeepFS is a uniquely useful type as well, allowing one to traverse the
file system, including archive files (and compressed archive files!),
as if they were part of the regular file system.

But it is probably a terrible thing to do. :)
---
 7z.go         |   3 +-
 brotli.go     |   1 +
 bz2.go        |   1 +
 formats.go    | 137 +++++++++++++++++----------------
 fs.go         | 204 +++++++++++++++++++++++++++++++++++++++++++++++++-
 gz.go         |   1 +
 interfaces.go |  25 ++++---
 lz4.go        |   1 +
 lzip.go       |   1 +
 rar.go        |   1 +
 sz.go         |   3 +-
 tar.go        |   1 +
 xz.go         |   1 +
 zip.go        |   3 +-
 zlib.go       |   1 +
 zstd.go       |   1 +
 16 files changed, 306 insertions(+), 79 deletions(-)

diff --git a/7z.go b/7z.go
index e98d12e..6ba4bac 100644
--- a/7z.go
+++ b/7z.go
@@ -31,7 +31,8 @@ type SevenZip struct {
     Password string
 }

-func (z SevenZip) Extension() string { return ".7z" }
+func (SevenZip) Extension() string { return ".7z" }
+func (SevenZip) MediaType() string { return "application/x-7z-compressed" }

 func (z SevenZip) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) {
     var mr MatchResult
diff --git a/brotli.go b/brotli.go
index c9968fa..e6d09ab 100644
--- a/brotli.go
+++ b/brotli.go
@@ -18,6 +18,7 @@ type Brotli struct {
 }

 func (Brotli) Extension() string { return ".br" }
+func (Brotli) MediaType() string { return "application/x-br" }

 func (br Brotli) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) {
     var mr MatchResult
diff --git a/bz2.go b/bz2.go
index 40780a8..ff7bb3d 100644
--- a/bz2.go
+++ b/bz2.go
@@ -19,6 +19,7 @@ type Bz2 struct {
 }

 func (Bz2) Extension() string { return ".bz2" }
+func (Bz2) MediaType() string { return "application/x-bzip2" }

 func (bz Bz2) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) {
     var mr MatchResult
diff --git a/formats.go b/formats.go
index f7c8873..597dd48 100644
--- a/formats.go
+++ b/formats.go
@@ -6,6 +6,8 @@ import (
     "errors"
     "fmt"
     "io"
+    "path"
+    "path/filepath"
     "strings"
 )

@@ -25,25 +27,23 @@ func RegisterFormat(format Format) {
 // compressed archive files (tar.gz, tar.bz2...). The returned Format
 // value can be type-asserted to ascertain its capabilities.
 //
 // If no matching formats were found, special error NoMatch is returned.
 //
 // If stream is nil then it will only match on file name and the
 // returned io.Reader will be nil.
 //
-// If stream is non-nil then the returned io.Reader will always be
-// non-nil and will read from the same point as the reader which was
-// passed in. If the input stream is not an io.Seeker, the returned
-// io.Reader value should be used in place of the input stream after
-// calling Identify() because it preserves and re-reads the bytes that
-// were already read during the identification process.
-//
-// If the input stream is an io.Seeker, Seek() must work, and the
-// original input value will be returned instead of a wrapper value.
+// If stream is non-nil, it will be returned in the same read position +// as it was before Identify() was called, by virtue of buffering the +// peeked bytes. However, if the stream is an io.Seeker, Seek() must +// work, no extra buffering will be performed, and the original input +// value will be returned at the original position by seeking. func Identify(ctx context.Context, filename string, stream io.Reader) (Format, io.Reader, error) { var compression Compression var archival Archival var extraction Extraction + filename = path.Base(filepath.ToSlash(filename)) + rewindableStream, err := newRewindReader(stream) if err != nil { return nil, nil, err @@ -69,7 +69,7 @@ func Identify(ctx context.Context, filename string, stream io.Reader) (Format, i } } - // try archival and extraction format next + // try archival and extraction formats next for name, format := range formats { ar, isArchive := format.(Archival) ex, isExtract := format.(Extraction) @@ -98,8 +98,14 @@ func Identify(ctx context.Context, filename string, stream io.Reader) (Format, i return archival, bufferedStream, nil case compression == nil && archival == nil && extraction != nil: return extraction, bufferedStream, nil - case archival != nil || extraction != nil: - return Archive{compression, archival, extraction}, bufferedStream, nil + case compression == nil && archival != nil && extraction != nil: + // archival and extraction are always set together, so they must be the same + return archival, bufferedStream, nil + case compression != nil && extraction != nil: + // in practice, this is only used for compressed tar files, and the tar format can + // both read and write, so the archival value should always work too; but keep in + // mind that Identify() is used on existing files to be read, not new files to write + return CompressedArchive{archival, extraction, compression}, bufferedStream, nil default: return nil, bufferedStream, NoMatch } @@ -166,44 +172,43 @@ func readAtMost(stream io.Reader, n int) ([]byte, error) { return nil, err } -// Archive represents an archive which may be compressed at the outer layer. -// It combines a compression format on top of an archive/extraction -// format (e.g. ".tar.gz") and provides both functionalities in a single -// type. It ensures that archival functions are wrapped by compressors and -// decompressors. However, compressed archives have some limitations; for -// example, files cannot be inserted/appended because of complexities with -// modifying existing compression state (perhaps this could be overcome, -// but I'm not about to try it). -// -// The embedded Archival and Extraction values are used for writing and -// reading, respectively. Compression is optional and is only needed if the -// format is compressed externally (for example, tar archives). -type Archive struct { - Compression +// CompressedArchive represents an archive which is compressed externally +// (for example, a gzipped tar file, .tar.gz.) It combines a compression +// format on top of an archival/extraction format and provides both +// functionalities in a single type, allowing archival and extraction +// operations transparently through compression and decompression. However, +// compressed archives have some limitations; for example, files cannot be +// inserted/appended because of complexities with modifying existing +// compression state (perhaps this could be overcome, but I'm not about to +// try it). 
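
Before the type definition below, it may help to see how the refactored type is
intended to be used. The following is an illustrative sketch, not part of this
change: the file names are invented, and it assumes the package's existing
FilesFromDisk helper and the import path github.com/mholt/archives.

    package main

    import (
        "context"
        "log"
        "os"

        "github.com/mholt/archives"
    )

    func main() {
        ctx := context.Background()

        // Compose a .tar.gz format by hand: tar supplies the Archival
        // and Extraction halves, gzip the (now required) Compression.
        format := archives.CompressedArchive{
            Archival:    archives.Tar{},
            Extraction:  archives.Tar{},
            Compression: archives.Gz{},
        }

        // Map on-disk paths to paths inside the archive (names invented).
        files, err := archives.FilesFromDisk(ctx, nil, map[string]string{
            "example.txt": "example.txt",
        })
        if err != nil {
            log.Fatal(err)
        }

        out, err := os.Create("example.tar.gz")
        if err != nil {
            log.Fatal(err)
        }
        defer out.Close()

        if err := format.Archive(ctx, out, files); err != nil {
            log.Fatal(err)
        }
    }
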
+type CompressedArchive struct {
     Archival
     Extraction
+    Compression
 }

 // Extension returns a concatenation of the archive and compression format extensions.
-func (ar Archive) Extension() string {
+func (ca CompressedArchive) Extension() string {
     var name string
-    if ar.Archival != nil {
-        name += ar.Archival.Extension()
-    } else if ar.Extraction != nil {
-        name += ar.Extraction.Extension()
-    }
-    if ar.Compression != nil {
-        name += ar.Compression.Extension()
+    if ca.Archival != nil {
+        name += ca.Archival.Extension()
+    } else if ca.Extraction != nil {
+        name += ca.Extraction.Extension()
     }
+    name += ca.Compression.Extension()
     return name
 }

+// MediaType returns the compression format's MIME type, since
+// a compressed archive is fundamentally a compressed file.
+func (ca CompressedArchive) MediaType() string { return ca.Compression.MediaType() }
+
 // Match matches if the input matches both the compression and archival/extraction format.
-func (ar Archive) Match(ctx context.Context, filename string, stream io.Reader) (MatchResult, error) {
+func (ca CompressedArchive) Match(ctx context.Context, filename string, stream io.Reader) (MatchResult, error) {
     var conglomerate MatchResult

-    if ar.Compression != nil {
-        matchResult, err := ar.Compression.Match(ctx, filename, stream)
+    if ca.Compression != nil {
+        matchResult, err := ca.Compression.Match(ctx, filename, stream)
         if err != nil {
             return MatchResult{}, err
         }
@@ -213,7 +218,7 @@ func (ar Archive) Match(ctx context.Context, filename string, stream io.Reader)

         // wrap the reader with the decompressor so we can
         // attempt to match the archive by reading the stream
-        rc, err := ar.Compression.OpenReader(stream)
+        rc, err := ca.Compression.OpenReader(stream)
         if err != nil {
             return matchResult, err
         }
@@ -223,8 +228,8 @@
         conglomerate = matchResult
     }

-    if ar.Archival != nil {
-        matchResult, err := ar.Archival.Match(ctx, filename, stream)
+    if ca.Archival != nil {
+        matchResult, err := ca.Archival.Match(ctx, filename, stream)
         if err != nil {
             return MatchResult{}, err
         }
@@ -238,33 +243,33 @@
     return conglomerate, nil
 }

-// Archive adds files to the output archive while compressing the result.
-func (ar Archive) Archive(ctx context.Context, output io.Writer, files []FileInfo) error {
-    if ar.Archival == nil {
+// Archive writes an archive to the output stream while compressing the result.
+func (ca CompressedArchive) Archive(ctx context.Context, output io.Writer, files []FileInfo) error {
+    if ca.Archival == nil {
         return fmt.Errorf("no archival format")
     }
-    if ar.Compression != nil {
-        wc, err := ar.Compression.OpenWriter(output)
+    if ca.Compression != nil {
+        wc, err := ca.Compression.OpenWriter(output)
         if err != nil {
             return err
         }
         defer wc.Close()
         output = wc
     }
-    return ar.Archival.Archive(ctx, output, files)
+    return ca.Archival.Archive(ctx, output, files)
 }

 // ArchiveAsync adds files to the output archive while compressing the result asynchronously.
-func (ar Archive) ArchiveAsync(ctx context.Context, output io.Writer, jobs <-chan ArchiveAsyncJob) error { - if ar.Archival == nil { +func (ca CompressedArchive) ArchiveAsync(ctx context.Context, output io.Writer, jobs <-chan ArchiveAsyncJob) error { + if ca.Archival == nil { return fmt.Errorf("no archival format") } - do, ok := ar.Archival.(ArchiverAsync) + do, ok := ca.Archival.(ArchiverAsync) if !ok { - return fmt.Errorf("%T archive does not support async writing", ar.Archival) + return fmt.Errorf("%T archive does not support async writing", ca.Archival) } - if ar.Compression != nil { - wc, err := ar.Compression.OpenWriter(output) + if ca.Compression != nil { + wc, err := ca.Compression.OpenWriter(output) if err != nil { return err } @@ -274,20 +279,20 @@ func (ar Archive) ArchiveAsync(ctx context.Context, output io.Writer, jobs <-cha return do.ArchiveAsync(ctx, output, jobs) } -// Extract reads files out of an archive while decompressing the results. -func (ar Archive) Extract(ctx context.Context, sourceArchive io.Reader, handleFile FileHandler) error { - if ar.Extraction == nil { +// Extract reads files out of a compressed archive while decompressing the results. +func (ca CompressedArchive) Extract(ctx context.Context, sourceArchive io.Reader, handleFile FileHandler) error { + if ca.Extraction == nil { return fmt.Errorf("no extraction format") } - if ar.Compression != nil { - rc, err := ar.Compression.OpenReader(sourceArchive) + if ca.Compression != nil { + rc, err := ca.Compression.OpenReader(sourceArchive) if err != nil { return err } defer rc.Close() sourceArchive = rc } - return ar.Extraction.Extract(ctx, sourceArchive, handleFile) + return ca.Extraction.Extract(ctx, sourceArchive, handleFile) } // MatchResult returns true if the format was matched either @@ -303,6 +308,10 @@ type MatchResult struct { // Matched returns true if a match was made by either name or stream. func (mr MatchResult) Matched() bool { return mr.ByName || mr.ByStream } +func (mr MatchResult) String() string { + return fmt.Sprintf("{ByName=%v ByStream=%v}", mr.ByName, mr.ByStream) +} + // rewindReader is a Reader that can be rewound (reset) to re-read what // was already read and then continue to read more from the underlying // stream. When no more rewinding is necessary, call reader() to get a @@ -422,8 +431,10 @@ var formats = make(map[string]Format) // Interface guards var ( - _ Format = (*Archive)(nil) - _ Archiver = (*Archive)(nil) - _ ArchiverAsync = (*Archive)(nil) - _ Extractor = (*Archive)(nil) + _ Format = (*CompressedArchive)(nil) + _ Archiver = (*CompressedArchive)(nil) + _ ArchiverAsync = (*CompressedArchive)(nil) + _ Extractor = (*CompressedArchive)(nil) + _ Compressor = (*CompressedArchive)(nil) + _ Decompressor = (*CompressedArchive)(nil) ) diff --git a/fs.go b/fs.go index 4714e10..7e46948 100644 --- a/fs.go +++ b/fs.go @@ -11,6 +11,7 @@ import ( "path/filepath" "slices" "strings" + "sync" "time" ) @@ -65,7 +66,7 @@ func FileSystem(ctx context.Context, filename string, stream ReaderAtSeeker) (fs // real folders can be accessed easily if info.IsDir() { - return os.DirFS(filename), nil + return DirFS{os.DirFS(filename)}, nil } // if any archive formats recognize this file, access it like a folder @@ -202,6 +203,36 @@ type compressedFile struct { closeBoth // file and decompressor } +// DirFS is returned by FileSystem() if the input is a real directory +// on disk. 
It merely wraps the return value of os.DirFS(), which is +// (unfortunately) unexported, making it impossible to use with type +// assertions to determine which kind of FS was returned. Because this +// wrapper type is exported, it can be type-asserted against. +// If this type is used manually and the embedded type does not +// implement the same interfaces os.dirFS does, errors will occur. +type DirFS struct{ fs.FS } + +func (d DirFS) ReadFile(name string) ([]byte, error) { + if fsys, ok := d.FS.(fs.ReadFileFS); ok { + return fsys.ReadFile(name) + } + return nil, fmt.Errorf("not supported; wrapped type must implement fs.ReadFileFS") +} + +func (d DirFS) ReadDir(name string) ([]fs.DirEntry, error) { + if fsys, ok := d.FS.(fs.ReadDirFS); ok { + return fsys.ReadDir(name) + } + return nil, fmt.Errorf("not supported; wrapped type must implement fs.ReadDirFS") +} + +func (d DirFS) Stat(name string) (fs.FileInfo, error) { + if fsys, ok := d.FS.(fs.StatFS); ok { + return fsys.Stat(name) + } + return nil, fmt.Errorf("not supported; wrapped type must implement fs.StatFS") +} + // ArchiveFS allows reading an archive (or a compressed archive) using a // consistent file system interface. Essentially, it allows traversal and // reading of archive contents the same way as any normal directory on disk. @@ -350,7 +381,7 @@ func (f ArchiveFS) Open(name string) (fs.File, error) { } var decompressor io.ReadCloser - if decomp, ok := f.Format.(Decompressor); ok { + if decomp, ok := f.Format.(Decompressor); ok && decomp != nil { decompressor, err = decomp.OpenReader(inputStream) if err != nil { return nil, err @@ -411,7 +442,7 @@ func (f ArchiveFS) Open(name string) (fs.File, error) { // files may have a "." component in them, and the underlying format doesn't // know about our file system semantics, so we need to filter ourselves (it's // not significantly less efficient). - if ar, ok := f.Format.(Archive); ok { + if ar, ok := f.Format.(CompressedArchive); ok { // bypass the CompressedArchive format's opening of the decompressor, since // we already did it because we need to keep it open after returning. // "I BYPASSED THE COMPRESSOR!" -Rey @@ -634,6 +665,173 @@ func (f *ArchiveFS) Sub(dir string) (fs.FS, error) { return result, nil } +// DeepFS is a fs.FS that represents the real file system, but also has +// the ability to traverse into archive files as if they were part of the +// regular file system. If a filename component ends with an archive +// extension (e.g. .zip, .tar, .tar.gz, etc.), then the remainder of the +// filepath will be considered to be inside that archive. +// +// This allows treating archive files transparently as if they were part +// of the regular file system during a walk, which can be extremely useful +// for accessing data in an "ordinary" walk of the disk, without needing to +// first extract all the archives and use more disk space. +// +// The listing of archive entries is retained for the lifetime of the +// DeepFS value for efficiency, but this can use more memory if archives +// contain a lot of files. +type DeepFS struct { + // The root filepath on disk. + Root string + + // An optional context, mainly for cancellation. 
+ Context context.Context + + // remember archive file systems for efficiency + inners map[string]fs.FS + mu sync.Mutex +} + +func (fsys *DeepFS) Open(name string) (fs.File, error) { + if !fs.ValidPath(name) { + return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("%w: %s", fs.ErrInvalid, name)} + } + name = path.Join(fsys.Root, name) + realPath, innerPath := fsys.splitPath(name) + if innerPath != "" { + if innerFsys := fsys.getInnerFsys(realPath); innerFsys != nil { + return innerFsys.Open(innerPath) + } + } + return os.Open(realPath) +} + +func (fsys *DeepFS) Stat(name string) (fs.FileInfo, error) { + if !fs.ValidPath(name) { + return nil, &fs.PathError{Op: "stat", Path: name, Err: fmt.Errorf("%w: %s", fs.ErrInvalid, name)} + } + name = path.Join(fsys.Root, name) + realPath, innerPath := fsys.splitPath(name) + if innerPath != "" { + if innerFsys := fsys.getInnerFsys(realPath); innerFsys != nil { + return fs.Stat(innerFsys, innerPath) + } + } + return os.Stat(realPath) +} + +// ReadDir returns the directory listing for the given directory name, +// but for any entries that appear by their file extension to be archive +// files, they are slightly modified to always return true for IsDir(), +// since we have the unique ability to list the contents of archives as +// if they were directories. +func (fsys *DeepFS) ReadDir(name string) ([]fs.DirEntry, error) { + if !fs.ValidPath(name) { + return nil, &fs.PathError{Op: "readdir", Path: name, Err: fmt.Errorf("%w: %s", fs.ErrInvalid, name)} + } + name = path.Join(fsys.Root, name) + realPath, innerPath := fsys.splitPath(name) + if innerPath != "" { + if innerFsys := fsys.getInnerFsys(realPath); innerFsys != nil { + return fs.ReadDir(innerFsys, innerPath) + } + } + entries, err := os.ReadDir(realPath) + if err != nil { + return nil, err + } + // make sure entries that appear to be archive files indicate they are a directory + // so the fs package will try to walk them + for i, entry := range entries { + if slices.Contains(archiveExtensions, path.Ext(entry.Name())) { + entries[i] = alwaysDirEntry{entry} + } + } + return entries, nil +} + +// getInnerFsys reuses "inner" file systems, because for example, archives.ArchiveFS +// amortizes directory entries with the first call to ReadDir; if we don't reuse the +// file systems then they have to rescan the same archive multiple times. +func (fsys *DeepFS) getInnerFsys(realPath string) fs.FS { + realPath = filepath.Clean(realPath) + + fsys.mu.Lock() + defer fsys.mu.Unlock() + + if fsys.inners == nil { + fsys.inners = make(map[string]fs.FS) + } else if innerFsys, ok := fsys.inners[realPath]; ok { + return innerFsys + } + innerFsys, err := FileSystem(fsys.context(), realPath, nil) + if err == nil { + fsys.inners[realPath] = innerFsys + return innerFsys + } + return nil +} + +// splitPath splits a file path into the "real" path and the "inner" path components, +// where the split point is the extension of an archive filetype like ".zip" or ".tar.gz". +// The real path is the path that can be accessed on disk and will be returned with +// filepath separators. The inner path is the path that can be used within the archive. +// If no archive extension is found in the path, only the realPath is returned. +// If the input path is precisely an archive file (i.e. ends with an archive file +// extension), then innerPath is returned as "." which indicates the root of the archive. 
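
Before the splitPath implementation below, a sketch of how DeepFS is meant to be
driven in practice, via fs.WalkDir; the root directory and the paths in the
comments are invented for the example:

    package main

    import (
        "fmt"
        "io/fs"
        "log"

        "github.com/mholt/archives"
    )

    func main() {
        // Walk /data as one tree; archive files encountered along the
        // way are traversed as if they were ordinary directories.
        fsys := &archives.DeepFS{Root: "/data"}

        err := fs.WalkDir(fsys, ".", func(p string, d fs.DirEntry, err error) error {
            if err != nil {
                return err
            }
            // Entries inside an archive print as ordinary paths, e.g.
            // "backups/old.tar.gz/home/user/notes.txt".
            fmt.Println(p)
            return nil
        })
        if err != nil {
            log.Fatal(err)
        }
    }
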
+func (*DeepFS) splitPath(path string) (realPath, innerPath string) {
+    for _, ext := range archiveExtensions {
+        idx := strings.Index(path+"/", ext+"/")
+        if idx < 0 {
+            continue
+        }
+        splitPos := idx + len(ext)
+        realPath = filepath.Clean(filepath.FromSlash(path[:splitPos]))
+        innerPath = strings.TrimPrefix(path[splitPos:], "/")
+        if innerPath == "" {
+            // signal to the caller that this is an archive,
+            // even though it is the very root of the archive
+            innerPath = "."
+        }
+        return
+    }
+    realPath = filepath.Clean(filepath.FromSlash(path))
+    return
+}
+
+func (fsys *DeepFS) context() context.Context {
+    if fsys.Context != nil {
+        return fsys.Context
+    }
+    return context.Background()
+}
+
+// alwaysDirEntry always returns true for IsDir(). Because
+// DeepFS is able to walk archive files as directories,
+// this is used to trick fs.WalkDir into thinking they are
+// directories so that it traverses into them.
+type alwaysDirEntry struct {
+    fs.DirEntry
+}
+
+func (alwaysDirEntry) IsDir() bool { return true }
+
+// archiveExtensions contains extensions for popular and supported
+// archive types; sorted by popularity, and ordered with care since
+// some extensions are prefixes of others.
+var archiveExtensions = []string{
+    ".zip",
+    ".tar",
+    ".tgz",
+    ".tar.gz",
+    ".tar.bz2",
+    ".tar.zst",
+    ".tar.lz4",
+    ".tar.xz",
+    ".tar.sz",
+    ".tar.s2",
+    ".tar.lz",
+}
+
 // TopDirOpen is a special Open() function that may be useful if
 // a file system root was created by extracting an archive.
 //
diff --git a/gz.go b/gz.go
index 8e70e10..adbf1ed 100644
--- a/gz.go
+++ b/gz.go
@@ -31,6 +31,7 @@ type Gz struct {
 }

 func (Gz) Extension() string { return ".gz" }
+func (Gz) MediaType() string { return "application/gzip" }

 func (gz Gz) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) {
     var mr MatchResult
diff --git a/interfaces.go b/interfaces.go
index f1e20b7..f917ca6 100644
--- a/interfaces.go
+++ b/interfaces.go
@@ -12,17 +12,19 @@ type Format interface {
     // format.
     Extension() string

+    // MediaType returns the MIME type ("content type") of this
+    // format (see RFC 2046).
+    MediaType() string
+
     // Match returns true if the given name/stream is recognized.
     // One of the arguments is optional: filename might be empty
-    // if working with an unnamed stream, or stream might be
-    // empty if only working with a filename. The filename should
-    // consist only of the base name, not a path component, and is
-    // typically used for matching by file extension. However,
-    // matching by reading the stream is preferred. Match reads
-    // only as many bytes as needed to determine a match. To
-    // preserve the stream through matching, you should either
-    // buffer what is read by Match, or seek to the last position
-    // before Match was called.
+    // if working with an unnamed stream, or stream might be nil
+    // if only working with a file on disk; but both may also be
+    // specified. The filename should consist only of the base name,
+    // not path components, and is typically used for matching by
+    // file extension. However, matching by reading the stream is
+    // preferred as it is more accurate. Match reads only as many
+    // bytes as needed to determine a match.
     Match(ctx context.Context, filename string, stream io.Reader) (MatchResult, error)
 }

@@ -37,6 +39,7 @@ type Compression interface {
 type Archival interface {
     Format
     Archiver
+    Extractor
 }

 // Extraction is an archival format that extracts from (reads) archives.
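
To make the widened Format contract above concrete (Extension, the new
MediaType, and Match), here is a hypothetical minimal format; the type name,
extension, and media type are all invented for the example:

    package fake

    import (
        "context"
        "io"
        "strings"

        "github.com/mholt/archives"
    )

    // Fake is a stub format; it exists only to show the three
    // methods that Format now requires.
    type Fake struct{}

    func (Fake) Extension() string { return ".fake" }
    func (Fake) MediaType() string { return "application/x-fake" }

    // Match recognizes the format by file extension only; a real
    // implementation would also sniff the stream for magic bytes.
    func (Fake) Match(_ context.Context, filename string, _ io.Reader) (archives.MatchResult, error) {
        var mr archives.MatchResult
        if strings.HasSuffix(strings.ToLower(filename), ".fake") {
            mr.ByName = true
        }
        return mr, nil
    }

    // Interface guard, in the same style as the ones in formats.go.
    var _ archives.Format = (*Fake)(nil)
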
@@ -69,6 +72,7 @@ type Archiver interface { // ArchiveAsyncJob contains a File to be archived and a channel that // the result of the archiving should be returned on. +// EXPERIMENTAL: Subject to change or removal. type ArchiveAsyncJob struct { File FileInfo Result chan<- error @@ -77,6 +81,7 @@ type ArchiveAsyncJob struct { // ArchiverAsync is an Archiver that can also create archives // asynchronously by pumping files into a channel as they are // discovered. +// EXPERIMENTAL: Subject to change or removal. type ArchiverAsync interface { Archiver @@ -102,7 +107,7 @@ type Extractor interface { } // Inserter can insert files into an existing archive. -// EXPERIMENTAL: This API is subject to change. +// EXPERIMENTAL: Subject to change. type Inserter interface { // Insert inserts the files into archive. // diff --git a/lz4.go b/lz4.go index dd92e9c..39fce3c 100644 --- a/lz4.go +++ b/lz4.go @@ -19,6 +19,7 @@ type Lz4 struct { } func (Lz4) Extension() string { return ".lz4" } +func (Lz4) MediaType() string { return "application/x-lz4" } func (lz Lz4) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult diff --git a/lzip.go b/lzip.go index c3d0dce..fa7fdc1 100644 --- a/lzip.go +++ b/lzip.go @@ -18,6 +18,7 @@ func init() { type Lzip struct{} func (Lzip) Extension() string { return ".lz" } +func (Lzip) MediaType() string { return "application/x-lzip" } func (lz Lzip) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult diff --git a/rar.go b/rar.go index 1eb6f34..cda389e 100644 --- a/rar.go +++ b/rar.go @@ -31,6 +31,7 @@ type Rar struct { } func (Rar) Extension() string { return ".rar" } +func (Rar) MediaType() string { return "application/vnd.rar" } func (r Rar) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult diff --git a/sz.go b/sz.go index 34ec3fb..bb23f21 100644 --- a/sz.go +++ b/sz.go @@ -44,7 +44,8 @@ type S2 struct { SnappyIncompatible bool } -func (sz Sz) Extension() string { return ".sz" } +func (Sz) Extension() string { return ".sz" } +func (Sz) MediaType() string { return "application/x-snappy-framed" } func (sz Sz) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult diff --git a/tar.go b/tar.go index 9e91692..e82ff06 100644 --- a/tar.go +++ b/tar.go @@ -27,6 +27,7 @@ type Tar struct { } func (Tar) Extension() string { return ".tar" } +func (Tar) MediaType() string { return "application/x-tar" } func (t Tar) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult diff --git a/xz.go b/xz.go index d5e45d5..68905d2 100644 --- a/xz.go +++ b/xz.go @@ -18,6 +18,7 @@ func init() { type Xz struct{} func (Xz) Extension() string { return ".xz" } +func (Xz) MediaType() string { return "application/x-xz" } func (x Xz) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult diff --git a/zip.go b/zip.go index 826ec54..85623e9 100644 --- a/zip.go +++ b/zip.go @@ -77,7 +77,8 @@ type Zip struct { TextEncoding encoding.Encoding } -func (z Zip) Extension() string { return ".zip" } +func (Zip) Extension() string { return ".zip" } +func (Zip) MediaType() string { return "application/zip" } func (z Zip) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult diff --git a/zlib.go b/zlib.go index a1998d5..9ee64f4 100644 --- a/zlib.go +++ b/zlib.go @@ -18,6 +18,7 @@ type Zlib 
struct { } func (Zlib) Extension() string { return ".zz" } +func (Zlib) MediaType() string { return "application/zlib" } func (zz Zlib) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult diff --git a/zstd.go b/zstd.go index 7677d95..c36c6b9 100644 --- a/zstd.go +++ b/zstd.go @@ -20,6 +20,7 @@ type Zstd struct { } func (Zstd) Extension() string { return ".zst" } +func (Zstd) MediaType() string { return "application/zstd" } func (zs Zstd) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) { var mr MatchResult
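
Taken together, the identification flow with the refactored types looks roughly
like this; a sketch with an invented file name and minimal error handling:

    package main

    import (
        "context"
        "errors"
        "fmt"
        "log"
        "os"

        "github.com/mholt/archives"
    )

    func main() {
        ctx := context.Background()

        f, err := os.Open("sample.tar.gz") // hypothetical input
        if err != nil {
            log.Fatal(err)
        }
        defer f.Close()

        format, input, err := archives.Identify(ctx, f.Name(), f)
        if errors.Is(err, archives.NoMatch) {
            log.Fatal("format not recognized")
        } else if err != nil {
            log.Fatal(err)
        }

        // For a .tar.gz this should be a CompressedArchive whose
        // MediaType comes from the outer compression layer (gzip).
        fmt.Println(format.Extension(), format.MediaType())

        // Type-assert to discover capabilities, then read the entries.
        if ex, ok := format.(archives.Extractor); ok {
            err := ex.Extract(ctx, input, func(_ context.Context, info archives.FileInfo) error {
                fmt.Println(info.NameInArchive)
                return nil
            })
            if err != nil {
                log.Fatal(err)
            }
        }
    }
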