diff --git a/build.gradle b/build.gradle index a5de0a8..a6094ea 100644 --- a/build.gradle +++ b/build.gradle @@ -17,7 +17,7 @@ sourceCompatibility = 1.8 group = 'edu.mit.lib' archivesBaseName = 'bagit' -version = '0.8' +version = '0.9' description = 'Compact Java BagIt library' ext { @@ -28,7 +28,7 @@ ext { } dependencies { - compile group: 'org.apache.commons', name: 'commons-compress', version: '1.13' + compile group: 'org.apache.commons', name: 'commons-compress', version: '1.15' testCompile group: 'junit', name: 'junit', version: '4.11' } diff --git a/src/main/java/edu/mit/lib/bagit/Bag.java b/src/main/java/edu/mit/lib/bagit/Bag.java index 084b623..9424aec 100644 --- a/src/main/java/edu/mit/lib/bagit/Bag.java +++ b/src/main/java/edu/mit/lib/bagit/Bag.java @@ -8,6 +8,7 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.DirectoryStream; import java.nio.file.Files; @@ -16,10 +17,12 @@ import java.security.DigestInputStream; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.Arrays; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.atomic.AtomicBoolean; /** * Bag represents a rudimentary bag conformant to LC Bagit spec - version 0.97. @@ -40,10 +43,9 @@ public class Bag { // coding constants - static final String ENCODING = "UTF-8"; static final String CS_ALGO = "MD5"; static final String BAGIT_VSN = "0.97"; - static final String LIB_VSN = "0.8"; + static final String LIB_VSN = "0.9"; static final String DFLT_FMT = "zip"; static final String TGZIP_FMT = "tgz"; static final String SPACER = " "; @@ -90,6 +92,12 @@ public String getName() { // directory root of bag private final Path baseDir; + // Checksum algorithm used in manifests + private final String csAlgorithm; + + // Character encoding declared for tag files + private final Charset tagEncoding; + // allow serialization, etc of bag private final boolean sealed; @@ -100,9 +108,11 @@ public String getName() { * Constructor - creates a new bag from a Loader * */ - Bag(Path baseDir, boolean sealed) { + Bag(Path baseDir, boolean sealed) throws IOException { this.baseDir = baseDir; this.sealed = sealed; + this.csAlgorithm = csAlgorithm(baseDir); + this.tagEncoding = tagEncoding(baseDir); } /** @@ -139,7 +149,17 @@ public String bagName() { * @throws IOException if default algorithm unknown */ public String csAlgorithm() throws IOException { - return csAlgorithm(baseDir); + return csAlgorithm; + } + + /** + * Returns the character encoding declared for tag files. + * + * @return declared character set encoding + * @throws IOException if encoding missing or unknown + */ + public Charset tagEncoding() throws IOException { + return tagEncoding; } /** @@ -149,18 +169,29 @@ public String csAlgorithm() throws IOException { * @throws IOException if unable to read bag contents */ public boolean isComplete() throws IOException { + return completeStatus() == 0; + } + + /** + * Returns completeness status code. + * + * @return status code for completeness: 0 is complete, negative is error code + * @throws IOException if unable to read bag contents + */ + public int completeStatus() throws IOException { // no fetch.txt? - if (Files.exists(bagFile(REF_FILE))) return false; - // mandatory files present? - if (! (Files.exists(bagFile(DECL_FILE)) && - Files.isDirectory(bagFile(DATA_DIR)))) return false; + if (Files.exists(bagFile(REF_FILE))) return -1; + // mandatory files present and correct? + if (! Files.exists(bagFile(DECL_FILE))) return -2; + if (! Charset.isSupported(tagEncoding.name())) return -3; + if (! Files.isDirectory(bagFile(DATA_DIR))) return -4; // payload files map? Map payloads = payloadManifest(); // # payload files and # manifest entries must agree - if (fileCount(bagFile(DATA_DIR)) != payloads.size()) return false; + if (fileCount(bagFile(DATA_DIR)) != payloads.size()) return -5; // files themselves must match also for (String path : payloads.keySet()) { - if (path.startsWith(DATA_DIR) && Files.notExists(bagFile(path))) return false; + if (path.startsWith(DATA_DIR) && Files.notExists(bagFile(path))) return -6; } // same drill for tag files Map tags = tagManifest(); @@ -178,12 +209,12 @@ public boolean isComplete() throws IOException { else tagCount++; } } - if (tagCount != tags.size()) return false; + if (tagCount != tags.size()) return -7; // files themselves must match also for (String path : tags.keySet()) { - if (Files.notExists(bagFile(path))) return false; + if (Files.notExists(bagFile(path))) return -8; } - return true; + return 0; } /** @@ -202,19 +233,31 @@ public boolean isSealed() { * @throws IOException if unable to read bag contents */ public boolean isValid() throws IOException { - if (! isComplete()) return false; + return validationStatus() == 0; + } + + /** + * Returns validation status code + * + * @return 0 if bag validates, else a negative number + * @throws IOException if unable to read bag contents + */ + public int validationStatus() throws IOException { + int status = completeStatus(); + if (status != 0) return status; // recompute all checksums and compare against manifest values Map payloads = payloadManifest(); for (String relPath : payloads.keySet()) { - String cutPath = relPath.substring(DATA_PATH.length()); - if (! validateFile(payloadStream(cutPath), payloads.get(relPath), csAlgorithm())) return false; + int offset = relPath.startsWith("./") ? 2 : 0; + String cutPath = relPath.substring(DATA_PATH.length() + offset); + if (! validateFile(payloadStream(cutPath), payloads.get(relPath), csAlgorithm)) return -9; } // same for tag files Map tags = tagManifest(); for (String relPath : tags.keySet()) { - if (! validateFile(tagStream(relPath), tags.get(relPath), csAlgorithm())) return false; + if (! validateFile(tagStream(relPath), tags.get(relPath), csAlgorithm)) return -10; } - return true; + return 0; } /** @@ -266,7 +309,7 @@ public InputStream payloadStream(String relPath) throws IOException { * @throws IOException if unable to read refs data */ public Map payloadRefs() throws IOException { - return payloadRefs(bagFile(REF_FILE)); + return payloadRefs(bagFile(REF_FILE), tagEncoding); } /** @@ -347,7 +390,7 @@ public List property(String relPath, String name) throws IOException { if (mdSet == null) { synchronized (mdCache) { mdSet = new HashMap<>(); - try (BufferedReader reader = Files.newBufferedReader(bagFile(relPath), StandardCharsets.UTF_8)) { + try (BufferedReader reader = Files.newBufferedReader(bagFile(relPath), tagEncoding)) { String propName = null; StringBuilder valSb = new StringBuilder(); String line; @@ -382,7 +425,7 @@ public List property(String relPath, String name) throws IOException { * @throws IOException if unable to read manifest */ public Map payloadManifest() throws IOException { - String sfx = csAlgorithm().toLowerCase() + ".txt"; + String sfx = csAlgoName(csAlgorithm) + ".txt"; return manifest(MANIF_FILE + sfx); } @@ -394,7 +437,7 @@ public Map payloadManifest() throws IOException { * @throws IOException if unable to read tag manifest */ public Map tagManifest() throws IOException { - String sfx = csAlgorithm().toLowerCase() + ".txt"; + String sfx = csAlgoName(csAlgorithm) + ".txt"; return manifest(TAGMANIF_FILE + sfx); } @@ -409,10 +452,10 @@ public Map tagManifest() throws IOException { */ public Map manifest(String relPath) throws IOException { Map mfMap = new HashMap<>(); - try (BufferedReader reader = Files.newBufferedReader(bagFile(relPath), StandardCharsets.UTF_8)) { + try (BufferedReader reader = Files.newBufferedReader(bagFile(relPath), tagEncoding)) { String line; while((line = reader.readLine()) != null) { - String[] parts = line.split(" "); + String[] parts = line.split("\\s+", 2); mfMap.put(parts[1], parts[0]); } } @@ -454,15 +497,15 @@ private Path bagFile(String name) { return baseDir.resolve(name); } - private boolean validateFile(InputStream is, String expectedChecksum, String csAlg) throws IOException { - byte[] buf = new byte[2048]; - int num = 0; + private static boolean validateFile(InputStream is, String expectedChecksum, String csAlg) throws IOException { if (is == null) { throw new IOException("no input"); } + byte[] buf = new byte[2048]; + int num = 0; // wrap stream in digest stream try (DigestInputStream dis = - new DigestInputStream(is, MessageDigest.getInstance(csAlg))) { + new DigestInputStream(is, MessageDigest.getInstance(csAlgoCode(csAlg)))) { while (num != -1) { num = dis.read(buf); } @@ -472,13 +515,13 @@ private boolean validateFile(InputStream is, String expectedChecksum, String csA } } - static Map payloadRefs(Path refFile) throws IOException { + static Map payloadRefs(Path refFile, Charset encoding) throws IOException { Map refMap = new HashMap<>(); if (Files.exists(refFile)) { - try (BufferedReader reader = Files.newBufferedReader(refFile, StandardCharsets.UTF_8)) { + try (BufferedReader reader = Files.newBufferedReader(refFile, encoding)) { String line; while((line = reader.readLine()) != null) { - String[] parts = line.split(" "); + String[] parts = line.split("\\s+"); refMap.put(parts[2], parts[0]); } } @@ -499,6 +542,45 @@ static String csAlgorithm(Path base) throws IOException { return null; } + static Charset tagEncoding(Path base) throws IOException { + try (BufferedReader reader = Files.newBufferedReader(base.resolve(DECL_FILE), StandardCharsets.UTF_8)) { + // second line has encoding + reader.readLine(); + String line = reader.readLine(); + if (line != null) { + String[] parts = line.split(":"); + return Charset.forName(parts[1].trim()); + } + } + return Charset.defaultCharset(); + } + + static String csAlgoName(String csAlgol) { + return csAlgol.toLowerCase().replace("-",""); + } + + static String csAlgoCode(String csAlgol) { + String csa = csAlgol.toUpperCase(); + if (csa.startsWith("SHA") && csa.indexOf("-") == -1) { + return "SHA-" + csa.substring(3); + } else { + return csa; + } + } + + static byte[] filterBytes(String data, Charset encoding, AtomicBoolean bomOut) { + byte[] dbytes = data.getBytes(encoding); + if (bomOut.compareAndSet(false, true)) { + return dbytes; + } else if (encoding.equals(StandardCharsets.UTF_16) || + encoding.equals(StandardCharsets.UTF_16BE) || + encoding.equals(StandardCharsets.UTF_16LE)) { + return Arrays.copyOfRange(dbytes, 2, dbytes.length); + } else { + return dbytes; + } + } + private static final char[] HEX_CHARS = "0123456789abcdef".toCharArray(); static String toHex(byte[] data) { if ((data == null) || (data.length == 0)) { diff --git a/src/main/java/edu/mit/lib/bagit/Bagger.java b/src/main/java/edu/mit/lib/bagit/Bagger.java index 02aa118..4ec3aeb 100644 --- a/src/main/java/edu/mit/lib/bagit/Bagger.java +++ b/src/main/java/edu/mit/lib/bagit/Bagger.java @@ -7,6 +7,7 @@ import java.io.IOException; import java.net.URL; +import java.nio.charset.Charset; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; @@ -33,6 +34,7 @@ public class Bagger { private String archFmt = "directory"; private boolean noTime = false; private String csAlg = "MD5"; + private Charset tagEnc = Charset.defaultCharset(); private final List optFlags = new ArrayList<>(); private int verbosityLevel; @@ -51,6 +53,7 @@ public static void main(String[] args) throws IOException, IllegalAccessExceptio case "-n": bagger.noTime = Boolean.valueOf(args[i+1]); break; case "-a": bagger.archFmt = args[i+1]; break; case "-c": bagger.csAlg = args[i+1]; break; + case "-e": bagger.tagEnc = Charset.forName(args[i+1]); break; case "-o": bagger.optFlags.add(args[i+1]); break; case "-v": bagger.verbosityLevel = Integer.parseInt(args[i+1]); break; default: System.out.println("Unknown option: '" + args[i] + "'"); usage(); @@ -85,6 +88,7 @@ public static void usage() { "-a - e.g. 'zip', 'tgz', (default: loose directory)\n" + "-n - 'true' or 'false'\n" + "-c - default: 'MD5'\n" + + "-e - default: 'UTF-8'\n" + "-o \n" + "-v - output level to console (default: 0 = no output)"); System.out.println( @@ -96,7 +100,7 @@ public static void usage() { private Bagger() {} private void fill(Path baseDir) throws IOException { - Filler filler = new Filler(baseDir, csAlg); + Filler filler = new Filler(baseDir, csAlg, tagEnc); if (optFlags.contains("nag")) { filler.noAutoGen(); } @@ -164,19 +168,21 @@ private void plug(Path bagPath) throws IOException { } private void complete(Path bagPath) throws IOException { - boolean complete = new Loader(bagPath).load().isComplete(); + int status = new Loader(bagPath).load().completeStatus(); + boolean complete = status == 0; if (verbosityLevel > 0) { message(bagPath.getFileName().toString(), complete, "complete"); } - System.exit(complete ? 0 : -1); + System.exit(status); } private void validate(Path bagPath) throws IOException { - boolean valid = new Loader(bagPath).load().isValid(); + int status = new Loader(bagPath).load().validationStatus(); + boolean valid = status == 0; if (verbosityLevel > 0) { message(bagPath.getFileName().toString(), valid, "valid"); } - System.exit(valid ? 0 : -1); + System.exit(status); } private void message(String name, boolean ok, String value) { diff --git a/src/main/java/edu/mit/lib/bagit/Filler.java b/src/main/java/edu/mit/lib/bagit/Filler.java index 04498b7..c173df8 100644 --- a/src/main/java/edu/mit/lib/bagit/Filler.java +++ b/src/main/java/edu/mit/lib/bagit/Filler.java @@ -10,6 +10,8 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; @@ -27,6 +29,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; @@ -55,6 +58,8 @@ public class Filler { private Path base; // checksum algorithm private String csAlg; + // Charset encoding for tag files + private Charset tagEncoding; // automatic metadata generation set private Set autogenNames; // total payload size @@ -101,44 +106,60 @@ public enum EolRule { /** * Returns a new Filler (bag builder) instance using * temporary directory to hold bag, default checksum - * algorithm (MD5), and system-defined line separator + * algorithm (MD5), default tag encoding (UTF-8), + * and system-defined line separator * * @throws IOException if error creating bag */ public Filler() throws IOException { - this(null, null, SYSTEM); + this(null, null, null, SYSTEM); } /** * Returns a new Filler (bag builder) instance using passed * directory to hold bag, default checksum algorithm (MD5), - * and system-defined line separator + * default tag encoding (UTF-8), and system-defined line separator * * @param base the base directory in which to construct the bag * @throws IOException if error creating bag */ public Filler(Path base) throws IOException { - this(base, null, SYSTEM); + this(base, null, null, SYSTEM); } /** * Returns a new filler (bag builder) instances using passed directory - * and checksum algorithm, and system-defined line separator + * and checksum algorithm, default tag encoding (UTF-8), + * and system-defined line separator * * @param base directory for bag - if null, create temporary directory * @param csAlgorithm checksum algorithm string - if null use default * @throws IOException if error creating bag */ public Filler(Path base, String csAlgorithm) throws IOException { - this(base, csAlgorithm, SYSTEM); + this(base, csAlgorithm, null, SYSTEM); + } + + /** + * Returns a new filler (bag builder) instances using passed directory + * checksum algorithm, and tag encoding, and system-defined line separator + * + * @param base directory for bag - if null, create temporary directory + * @param csAlgorithm checksum algorithm string - if null use default + * @param encoding tag encoding - if null use default + * @throws IOException if error creating bag + */ + public Filler(Path base, String csAlgorithm, Charset encoding) throws IOException { + this(base, csAlgorithm, encoding, SYSTEM); } /** * Returns a new filler (bag builder) instances using passed directory, - * checksum algorithm, and line separator for text files. + * checksum algorithm, tag encoding, and line separator for text files. * * @param base directory for bag - if null, create temporary directory * @param csAlgorithm checksum algorithm string - if null use default + * @param encoding character encoding to use for tag files - if null use default * @param eolRule line termination rule to use for generated text files. Values are: * SYSTEM - use system-defined line termination * COUNTER_SYSTEM - if on Windows, use Unix EOL, else reverse @@ -147,7 +168,7 @@ public Filler(Path base, String csAlgorithm) throws IOException { * * @throws IOException if error creating bag */ - public Filler(Path base, String csAlgorithm, EolRule eolRule) throws IOException { + public Filler(Path base, String csAlgorithm, Charset encoding, EolRule eolRule) throws IOException { if (base != null) { this.base = base; } else { @@ -155,14 +176,15 @@ public Filler(Path base, String csAlgorithm, EolRule eolRule) throws IOException transientBag = true; } csAlg = (csAlgorithm != null) ? csAlgorithm : CS_ALGO; + tagEncoding = (encoding != null) ? encoding : StandardCharsets.UTF_8; Path dirPath = bagFile(DATA_DIR); if (Files.notExists(dirPath)) { Files.createDirectories(dirPath); } // prepare manifest writers - String sfx = csAlg.toLowerCase() + ".txt"; - tagWriter = new FlatWriter(bagFile(TAGMANIF_FILE + sfx), null, null, false); - manWriter = new FlatWriter(bagFile(MANIF_FILE + sfx), null, tagWriter, true); + String sfx = csAlgoName(csAlg) + ".txt"; + tagWriter = new FlatWriter(bagFile(TAGMANIF_FILE + sfx), null, null, false, tagEncoding); + manWriter = new FlatWriter(bagFile(MANIF_FILE + sfx), null, tagWriter, true, tagEncoding); writers = new HashMap<>(); streams = new HashMap<>(); // set up default auto-generated metadata @@ -213,9 +235,9 @@ private void buildBag() throws IOException { // close the manifest file manWriter.close(); // write out bagit declaration file - FlatWriter fwriter = new FlatWriter(bagFile(DECL_FILE), null, tagWriter, false); + FlatWriter fwriter = new FlatWriter(bagFile(DECL_FILE), null, tagWriter, false, StandardCharsets.UTF_8); fwriter.writeLine("BagIt-Version: " + BAGIT_VSN); - fwriter.writeLine("Tag-File-Character-Encoding: " + ENCODING); + fwriter.writeLine("Tag-File-Character-Encoding: " + tagEncoding.name()); fwriter.close(); // close tag manifest file of previous tag files tagWriter.close(); @@ -295,7 +317,7 @@ private void commitPayload(Path payloadFile, String relPath, InputStream is) thr } // wrap stream in digest stream try (DigestInputStream dis = - new DigestInputStream(is, MessageDigest.getInstance(csAlg))) { + new DigestInputStream(is, MessageDigest.getInstance(csAlgoCode(csAlg)))) { payloadSize += Files.copy(dis, payloadFile); payloadCount++; // record checksum @@ -378,7 +400,7 @@ public Filler tag(String relPath, InputStream is) throws IOException { } // wrap stream in digest stream try (DigestInputStream dis = - new DigestInputStream(is, MessageDigest.getInstance(csAlg))) { + new DigestInputStream(is, MessageDigest.getInstance(csAlgoCode(csAlg)))) { Files.copy(dis, tagFile(relPath)); // record checksum tagWriter.writeLine(toHex(dis.getMessageDigest().digest()) + " " + relPath); @@ -473,7 +495,7 @@ private Path bagFile(String name) { private synchronized FlatWriter getWriter(String name) throws IOException { FlatWriter writer = writers.get(name); if (writer == null) { - writer = new FlatWriter(bagFile(name), null, tagWriter, false); + writer = new FlatWriter(bagFile(name), null, tagWriter, false, tagEncoding); writers.put(name, writer); } return writer; @@ -492,10 +514,13 @@ class FlatWriter extends BagOutputStream { private final List lines = new ArrayList<>(); private final boolean record; + private final Charset encoding; + private final AtomicBoolean bomOut = new AtomicBoolean(); - private FlatWriter(Path file, String brPath, FlatWriter tailWriter, boolean record) throws IOException { + private FlatWriter(Path file, String brPath, FlatWriter tailWriter, boolean record, Charset encoding) throws IOException { super(file, brPath, tailWriter, false); this.record = record; + this.encoding = encoding; } public void writeProperty(String key, String value) throws IOException { @@ -504,7 +529,7 @@ public void writeProperty(String key, String value) throws IOException { while (offset < prop.length()) { int end = Math.min(prop.length() - offset, 80); if (offset > 0) { - write(SPACER.getBytes(ENCODING)); + write(filterBytes(SPACER, encoding, bomOut)); } writeLine(prop.substring(offset, offset + end)); offset += end; @@ -515,8 +540,7 @@ public void writeLine(String line) throws IOException { if (record) { lines.add(line); } - byte[] bytes = (line + lineSeparator).getBytes(ENCODING); - write(bytes); + write(filterBytes(line + lineSeparator, encoding, bomOut)); } public List getLines() { @@ -537,7 +561,7 @@ class BagOutputStream extends OutputStream { private BagOutputStream(Path file, String relPath, FlatWriter tailWriter, boolean isPayload) throws IOException { try { out = Files.newOutputStream(file); - dout = new DigestOutputStream(out, MessageDigest.getInstance(csAlg)); + dout = new DigestOutputStream(out, MessageDigest.getInstance(csAlgoCode(csAlg))); this.relPath = (relPath != null) ? relPath : file.getFileName().toString(); this.tailWriter = tailWriter; this.isPayload = isPayload; diff --git a/src/main/java/edu/mit/lib/bagit/Loader.java b/src/main/java/edu/mit/lib/bagit/Loader.java index 4565ea3..768462d 100644 --- a/src/main/java/edu/mit/lib/bagit/Loader.java +++ b/src/main/java/edu/mit/lib/bagit/Loader.java @@ -9,7 +9,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.nio.charset.StandardCharsets; +import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.OpenOption; import java.nio.file.Path; @@ -24,6 +24,7 @@ import java.util.Map; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.Scanner; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; @@ -147,48 +148,50 @@ private void finish() throws IOException { // if manWriter is non-null, some payload files were fetched. if (manWriter != null) { manWriter.close(); + Charset encoding = tagEncoding(base); + AtomicBoolean bomOut = new AtomicBoolean(); // Update fetch.txt - remove if all holes plugged, else filter Path refFile = bagFile(REF_FILE); if (payloadRefMap.size() > 0) { // now reconstruct fetch.txt filtering out those resolved try (OutputStream refOut = Files.newOutputStream(refFile)) { - for (String refline : bufferFile(refFile)) { - String[] parts = refline.split(" "); - if (payloadRefMap.containsKey(parts[2])) { - refOut.write(refline.getBytes(ENCODING)); + for (String refline : bufferFile(refFile, encoding)) { + String[] parts = refline.split("\\s+"); + if (payloadRefMap.containsKey(parts[2].trim())) { + refOut.write(filterBytes(refline, encoding, bomOut)); } } } } // update tagmanifest with new manifest checksum, fetch stuff - String sfx = csAlgorithm() + ".txt"; + String sfx = csAlgoName(csAlgorithm()) + ".txt"; Path tagManFile = bagFile(TAGMANIF_FILE + sfx); - List tmLines = bufferFile(tagManFile); // now recompute manifest checksum String manCS = checksum(bagFile(MANIF_FILE + sfx), csAlgorithm()); // likewise fetch.txt if it's still around String fetchCS = Files.exists(refFile) ? checksum(bagFile(MANIF_FILE + sfx), csAlgorithm()) : null; // recreate tagmanifest with new checksums + bomOut.set(false); try (OutputStream tagManOut = Files.newOutputStream(tagManFile)) { - for (String tline : tmLines) { - String[] parts = tline.split(" "); + for (String tline : bufferFile(tagManFile, encoding)) { + String[] parts = tline.split("\\s+"); if (parts[1].startsWith(MANIF_FILE)) { - tagManOut.write((manCS + " " + MANIF_FILE + sfx + lineSeparator).getBytes(ENCODING)); + tagManOut.write(filterBytes(manCS + " " + MANIF_FILE + sfx + lineSeparator, encoding, bomOut)); } else if (parts[1].startsWith(REF_FILE)) { if (fetchCS != null) { - tagManOut.write((fetchCS + " " + REF_FILE + sfx + lineSeparator).getBytes(ENCODING)); + tagManOut.write(filterBytes(fetchCS + " " + REF_FILE + sfx + lineSeparator, encoding, bomOut)); } } else { - tagManOut.write(tline.getBytes(ENCODING)); + tagManOut.write(filterBytes(tline, encoding,bomOut)); } } } } } - private List bufferFile(Path file) throws IOException { + private List bufferFile(Path file, Charset encoding) throws IOException { List lines = new ArrayList<>(); - try (BufferedReader reader = Files.newBufferedReader(file, StandardCharsets.UTF_8)) { + try (BufferedReader reader = Files.newBufferedReader(file, encoding)) { String line; while ((line = reader.readLine()) != null) { lines.add(line + lineSeparator); @@ -208,7 +211,7 @@ public Map payloadRefs() throws IOException { Path refFile = bagFile(REF_FILE); if (payloadRefMap.isEmpty() && Files.exists(refFile)) { // load initial data - payloadRefMap.putAll(Bag.payloadRefs(refFile)); + payloadRefMap.putAll(Bag.payloadRefs(refFile, tagEncoding(base))); } return payloadRefMap; } @@ -229,7 +232,7 @@ public void resolveRef(String relPath, InputStream is) throws IOException { throw new IllegalStateException("Payload file already exists at: " + relPath); } // wrap stream in digest stream - try (DigestInputStream dis = new DigestInputStream(is, MessageDigest.getInstance(csAlgorithm()))) { + try (DigestInputStream dis = new DigestInputStream(is, MessageDigest.getInstance(csAlgoCode(csAlg)))) { Files.copy(dis, bagFile(relPath)); // record checksum manifestWriter().writeLine(toHex(dis.getMessageDigest().digest()) + " " + relPath); @@ -247,24 +250,28 @@ private Path bagFile(String name) { // lazy initialization of manifest writer private synchronized LoaderWriter manifestWriter() throws IOException { if (manWriter == null) { - Path manif = bagFile(MANIF_FILE + csAlgorithm().toLowerCase() + ".txt"); + Path manif = bagFile(MANIF_FILE + csAlgoName(csAlgorithm()) + ".txt"); // set line separator for writer to match existing file encoding try (Scanner sc = new Scanner(manif)) { lineSeparator = (sc.findWithinHorizon("\r\n", 500) != null) ? "\r\n" : "\n"; } - manWriter = new LoaderWriter(manif); + manWriter = new LoaderWriter(manif, tagEncoding(base)); } return manWriter; } class LoaderWriter extends LoaderOutputStream { - private LoaderWriter(Path file) throws IOException { + private final Charset encoding; + private final AtomicBoolean bomOut = new AtomicBoolean(); + + private LoaderWriter(Path file, Charset encoding) throws IOException { super(file); + this.encoding = encoding; } public void writeLine(String line) throws IOException { - write((line + lineSeparator).getBytes(ENCODING)); + write(filterBytes(line + lineSeparator, encoding, bomOut)); } } @@ -280,7 +287,7 @@ private LoaderOutputStream(Path file) throws IOException { OpenOption opt = StandardOpenOption.APPEND; try { out = Files.newOutputStream(file, opt); - dout = new DigestOutputStream(out, MessageDigest.getInstance(csAlg)); + dout = new DigestOutputStream(out, MessageDigest.getInstance(csAlgoCode(csAlg))); this.relPath = file.getFileName().toString(); this.tailWriter = null; } catch (NoSuchAlgorithmException nsae) { @@ -347,7 +354,7 @@ private String checksum(Path file, String csAlg) throws IOException { int num = 0; // wrap stream in digest stream try (InputStream is = Files.newInputStream(file); - DigestInputStream dis = new DigestInputStream(is, MessageDigest.getInstance(csAlg))) { + DigestInputStream dis = new DigestInputStream(is, MessageDigest.getInstance(csAlgoCode(csAlg)))) { while (num != -1) { num = dis.read(buf); } diff --git a/src/test/java/edu/mit/lib/bagit/BagTest.java b/src/test/java/edu/mit/lib/bagit/BagTest.java index 65d3944..e194e6d 100644 --- a/src/test/java/edu/mit/lib/bagit/BagTest.java +++ b/src/test/java/edu/mit/lib/bagit/BagTest.java @@ -7,6 +7,8 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.attribute.BasicFileAttributes; @@ -594,7 +596,7 @@ public void defaultEOLInTextFiles() throws IOException { @Test public void unixEOLInTextFiles() throws IOException { Path bagFile = tempFolder.newFolder("bag33").toPath(); - Filler filler = new Filler(bagFile, "MD5", Filler.EolRule.UNIX); + Filler filler = new Filler(bagFile, "MD5", StandardCharsets.UTF_8, Filler.EolRule.UNIX); OutputStream plout = filler.payloadStream("first.pdf"); for (int i = 0; i < 1000; i++) { plout.write("lskdflsfevmep".getBytes()); @@ -610,7 +612,7 @@ public void unixEOLInTextFiles() throws IOException { @Test public void windowsEOLInTextFiles() throws IOException { Path bagFile = tempFolder.newFolder("bag34").toPath(); - Filler filler = new Filler(bagFile, "MD5", Filler.EolRule.WINDOWS); + Filler filler = new Filler(bagFile, "MD5", StandardCharsets.UTF_8, Filler.EolRule.WINDOWS); OutputStream plout = filler.payloadStream("first.pdf"); for (int i = 0; i < 1000; i++) { plout.write("lskdflsfevmep".getBytes()); @@ -626,7 +628,7 @@ public void windowsEOLInTextFiles() throws IOException { @Test public void counterEOLInTextFiles() throws IOException { Path bagFile = tempFolder.newFolder("bag35").toPath(); - Filler filler = new Filler(bagFile, "MD5", Filler.EolRule.COUNTER_SYSTEM); + Filler filler = new Filler(bagFile, "MD5", StandardCharsets.UTF_8, Filler.EolRule.COUNTER_SYSTEM); OutputStream plout = filler.payloadStream("first.pdf"); for (int i = 0; i < 1000; i++) { plout.write("lskdflsfevmep".getBytes()); @@ -639,6 +641,23 @@ public void counterEOLInTextFiles() throws IOException { assertTrue(! findSeparator(info).equals(System.lineSeparator())); } + @Test + public void validAndInvalidBagUTF16() throws IOException { + Path bagFile = tempFolder.newFolder("bag36").toPath(); + Filler filler = new Filler(bagFile, "MD5", StandardCharsets.UTF_16).payload("first.pdf", payload1); + Bag bag = new Loader(filler.toDirectory()).load(); + Map tman = bag.tagManifest(); + assertTrue(tman.size() == 3); + assertTrue(tman.keySet().contains("bagit.txt")); + assertTrue(tman.keySet().contains("bag-info.txt")); + assertTrue(tman.keySet().contains("manifest-md5.txt")); + assertTrue(bag.isValid()); + // now remove a payload file + Path toDel = bagFile.resolve("data/first.pdf"); + Files.delete(toDel); + assertTrue(!bag.isValid()); + } + private String findSeparator(Path file) throws IOException { try (Scanner scanner = new Scanner(file)) { // it's one or the other