From 8be9ce6262ab48fe58fb58a6919b42a135dee4df Mon Sep 17 00:00:00 2001 From: Richard Rodgers Date: Wed, 2 Dec 2015 18:59:40 -0500 Subject: [PATCH] Adds support for client assignment of line termination for generated text files. Closes #1 --- README.md | 16 ++++- build.gradle | 4 +- src/main/java/edu/mit/lib/bagit/Bag.java | 2 +- src/main/java/edu/mit/lib/bagit/Filler.java | 67 ++++++++++++++++-- src/main/java/edu/mit/lib/bagit/Loader.java | 24 ++++--- src/test/java/edu/mit/lib/bagit/BagTest.java | 72 ++++++++++++++++++++ 6 files changed, 164 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 34feac9..21fe6e0 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,18 @@ Or the bag contents may be obtained from a network stream: For all the API details consult the [Javadoc](http://richardrodgers.github.io/bagit/javadoc/index.html) +## Portability ## + +Bags are intended to be portable data containers, in that one should be able to write them on one operating system, +and read them on another. The spec contemplates this in specific ways, e.g. by allowing text files such as +'bag-info.txt' to legally have _either_ Unix-style or Windows-style line termination. Tools operating on bags ought +to expect and tolerate this diversity, but do not always. The library provides some assistance here by allowing the user +to specify a preference when creating bags. Thus, if the context of use (lifecycle) for a set of bags is known to be in +a Windows environment, the library can be instructed to use Windows line termination for the generated text files in bags, +even if the bags are being generated on a Unix system. By default, the library will use the line termination of the +operating system it is running on (CR/LF, '\r\n', on Windows; LF, '\n', on Unix and macOS), but this can be overridden. +See the [Javadoc](http://richardrodgers.github.io/bagit/javadoc/index.html) for details. + ## Archive formats ## Bags are commonly serialized to standard archive formats such as ZIP. 
The library supports two archive formats: @@ -119,14 +131,14 @@ Fat jars include all dependencies in a single executable jar (no classpath decla The distribution jars are kept at [Bintray](https://bintray.com), so make sure that repository is declared. Then (NB: using the most current version), for Gradle: - compile 'edu.mit.lib:bagit:0.6' + compile 'edu.mit.lib:bagit:0.7' or Maven: edu.mit.lib bagit - 0.6 + 0.7 in a standard pom.xml dependencies block. diff --git a/build.gradle b/build.gradle index 967f97a..0062bf0 100644 --- a/build.gradle +++ b/build.gradle @@ -17,7 +17,7 @@ sourceCompatibility = 1.8 group = 'edu.mit.lib' archivesBaseName = 'bagit' -version = '0.6' +version = '0.7' description = 'Compact Java BagIt library' ext { @@ -44,7 +44,7 @@ task sourcesJar(type: Jar) { from sourceSets.main.allSource } -task javadocJar(type: Jar) { +task javadocJar(type: Jar, dependsOn: javadoc) { classifier = 'javadoc' from javadoc.destinationDir } diff --git a/src/main/java/edu/mit/lib/bagit/Bag.java b/src/main/java/edu/mit/lib/bagit/Bag.java index bf3a9a7..d072d65 100644 --- a/src/main/java/edu/mit/lib/bagit/Bag.java +++ b/src/main/java/edu/mit/lib/bagit/Bag.java @@ -44,7 +44,7 @@ public class Bag { static final String ENCODING = "UTF-8"; static final String CS_ALGO = "MD5"; static final String BAGIT_VSN = "0.97"; - static final String LIB_VSN = "0.6"; + static final String LIB_VSN = "0.7"; static final String DFLT_FMT = "zip"; static final String TGZIP_FMT = "tgz"; static final String SPACER = " "; diff --git a/src/main/java/edu/mit/lib/bagit/Filler.java b/src/main/java/edu/mit/lib/bagit/Filler.java index 07b7d51..449a70d 100644 --- a/src/main/java/edu/mit/lib/bagit/Filler.java +++ b/src/main/java/edu/mit/lib/bagit/Filler.java @@ -41,6 +41,7 @@ import static edu.mit.lib.bagit.Bag.*; import static edu.mit.lib.bagit.Bag.MetadataName.*; +import static edu.mit.lib.bagit.Filler.EolRule.*; /** * Filler is a builder class used to construct bags conformant to LC Bagit spec - 
version 0.97. @@ -75,38 +76,82 @@ public class Filler { private boolean built; // transient bag? private boolean transientBag; + // line separator used by FlatWriters + private final String lineSeparator; + + /** + * Rule for assigning the EOL (line termination/separation) + */ + public enum EolRule { + /** + * Use system-defined separator + */ + SYSTEM, + /** + * Use Windows separators on Unix systems, + * or vice versa + */ + COUNTER_SYSTEM, + /** + * Use Unix new-line '\n' separators + */ + UNIX, + /** + * Use Windows CR/LF separators + */ + WINDOWS + } /** * Returns a new Filler (bag builder) instance using - * temporary directory to hold bag and default checksum - * algorithm (MD5). + * temporary directory to hold bag, default checksum + * algorithm (MD5), and system-defined line separator * * @throws IOException if error creating bag */ public Filler() throws IOException { - this(null, null); + this(null, null, SYSTEM); } /** * Returns a new Filler (bag builder) instance using passed - * directory to hold bag and default checksum algorithm (MD5). + * directory to hold bag, default checksum algorithm (MD5), + * and system-defined line separator * * @param base the base directory in which to construct the bag * @throws IOException if error creating bag */ public Filler(Path base) throws IOException { - this(base, null); + this(base, null, SYSTEM); } /** * Returns a new filler (bag builder) instances using passed directory - * and checksum algorithm. + * and checksum algorithm, and system-defined line separator * * @param base directory for bag - if null, create temporary directory * @param csAlgorithm checksum algorithm string - if null use default * @throws IOException if error creating bag */ public Filler(Path base, String csAlgorithm) throws IOException { + this(base, csAlgorithm, SYSTEM); + } + + /** + * Returns a new filler (bag builder) instances using passed directory, + * checksum algorithm, and line separator for text files. 
+ * + * @param base directory for bag - if null, create temporary directory + * @param csAlgorithm checksum algorithm string - if null use default + * @param eolRule line termination rule to use for generated text files. Values are: + * SYSTEM - use system-defined line termination + * COUNTER_SYSTEM - if on Windows, use Unix EOL, else reverse + * UNIX - use newline character line termination + * WINDOWS - use CR/LF line termination + * + * @throws IOException if error creating bag + */ + public Filler(Path base, String csAlgorithm, EolRule eolRule) throws IOException { if (base != null) { this.base = base; } else { @@ -130,6 +175,14 @@ public Filler(Path base, String csAlgorithm) throws IOException { autogenNames.add(BAG_SIZE); autogenNames.add(PAYLOAD_OXUM); autogenNames.add(BAG_SOFTWARE_AGENT); + String sysEol = System.lineSeparator(); + switch (eolRule) { + case SYSTEM: lineSeparator = sysEol; break; + case UNIX: lineSeparator = "\n"; break; + case WINDOWS: lineSeparator = "\r\n"; break; + case COUNTER_SYSTEM: lineSeparator = "\n".equals(sysEol) ? 
"\r\n" : "\n"; break; + default: lineSeparator = sysEol; break; + } } private void buildBag() throws IOException { @@ -468,7 +521,7 @@ public void writeLine(String line) throws IOException { if (record) { lines.add(line); } - byte[] bytes = (line + "\n").getBytes(ENCODING); + byte[] bytes = (line + lineSeparator).getBytes(ENCODING); write(bytes); } diff --git a/src/main/java/edu/mit/lib/bagit/Loader.java b/src/main/java/edu/mit/lib/bagit/Loader.java index d3d1e5b..4f2f221 100644 --- a/src/main/java/edu/mit/lib/bagit/Loader.java +++ b/src/main/java/edu/mit/lib/bagit/Loader.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentHashMap; +import java.util.Scanner; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; @@ -51,6 +52,8 @@ public class Loader { private final ConcurrentMap payloadRefMap = new ConcurrentHashMap<>(); // manifest writer private LoaderWriter manWriter; + // line separator discovered in Bag text file + private String lineSeparator = System.lineSeparator(); /** * Returns a new Loader (bag loader) instance using passed @@ -107,7 +110,7 @@ public Loader(Path parent, InputStream in, String format) throws IOException { } /** - * Returns the checksum algortihm used in bag manifests. + * Returns the checksum algorithm used in bag manifests. 
* * @return algorithm the checksum algorithm * @throws IOException if error loading bag @@ -147,11 +150,10 @@ private void finish() throws IOException { manWriter.close(); // Update fetch.txt - remove if all holes plugged, else filter Path refFile = bagFile(REF_FILE); - List refLines = bufferFile(refFile); if (payloadRefMap.size() > 0) { // now reconstruct fetch.txt filtering out those resolved try (OutputStream refOut = Files.newOutputStream(refFile)) { - for (String refline : refLines) { + for (String refline : bufferFile(refFile)) { String[] parts = refline.split(" "); if (payloadRefMap.containsKey(parts[2])) { refOut.write(refline.getBytes(ENCODING)); @@ -172,10 +174,10 @@ private void finish() throws IOException { for (String tline : tmLines) { String[] parts = tline.split(" "); if (parts[1].startsWith(MANIF_FILE)) { - tagManOut.write((manCS + " " + MANIF_FILE + sfx + "\n").getBytes(ENCODING)); + tagManOut.write((manCS + " " + MANIF_FILE + sfx + lineSeparator).getBytes(ENCODING)); } else if (parts[1].startsWith(REF_FILE)) { if (fetchCS != null) { - tagManOut.write((fetchCS + " " + REF_FILE + sfx + "\n").getBytes(ENCODING)); + tagManOut.write((fetchCS + " " + REF_FILE + sfx + lineSeparator).getBytes(ENCODING)); } } else { tagManOut.write(tline.getBytes(ENCODING)); @@ -190,7 +192,7 @@ private List bufferFile(Path file) throws IOException { try (BufferedReader reader = Files.newBufferedReader(file, StandardCharsets.UTF_8)) { String line = null; while ((line = reader.readLine()) != null) { - lines.add(line + "\n"); + lines.add(line + lineSeparator); } } Files.delete(file); @@ -257,8 +259,12 @@ private Path dataFile(String name) throws IOException { // lazy initialization of manifest writer private synchronized LoaderWriter manifestWriter() throws IOException { if (manWriter == null) { - String sfx = csAlgorithm().toLowerCase() + ".txt"; - manWriter = new LoaderWriter(bagFile(MANIF_FILE + sfx), null, true, null); + Path manif = bagFile(MANIF_FILE + 
csAlgorithm().toLowerCase() + ".txt"); + // set line separator for writer to match existing file encoding + try (Scanner sc = new Scanner(manif)) { + lineSeparator = (sc.findWithinHorizon("\r\n", 500) != null) ? "\r\n" : "\n"; + } + manWriter = new LoaderWriter(manif, null, true, null); } return manWriter; } @@ -270,7 +276,7 @@ private LoaderWriter(Path file, String brPath, boolean append, LoaderWriter tail } public void writeLine(String line) throws IOException { - write((line + "\n").getBytes(ENCODING)); + write((line + lineSeparator).getBytes(ENCODING)); } } diff --git a/src/test/java/edu/mit/lib/bagit/BagTest.java b/src/test/java/edu/mit/lib/bagit/BagTest.java index a097d11..677b626 100644 --- a/src/test/java/edu/mit/lib/bagit/BagTest.java +++ b/src/test/java/edu/mit/lib/bagit/BagTest.java @@ -31,6 +31,7 @@ import static edu.mit.lib.bagit.Bag.*; import static edu.mit.lib.bagit.Bag.MetadataName.*; +import static edu.mit.lib.bagit.Filler.EolRule.*; /* * Basic unit tests for BagIt Library. Incomplete. 
@@ -578,6 +579,77 @@ public void loadedFromStreamValid() throws IOException { assertTrue(loadedBag.isValid()); } + @Test + public void defaultEOLInTextFiles() throws IOException { + Path bagFile = tempFolder.newFolder("bag32").toPath(); + Filler filler = new Filler(bagFile); + OutputStream plout = filler.payloadStream("first.pdf"); + for (int i = 0; i < 1000; i++) { + plout.write("lskdflsfevmep".getBytes()); + } + Path fullBag = filler.toDirectory(); + // use bag-info.txt as representative text-file + Path info = fullBag.resolve("bag-info.txt"); + // line ending is same as system-defined one + assertTrue(new Loader(fullBag).load().isValid()); + assertTrue(findSeparator(info).equals(System.lineSeparator())); + } + + @Test + public void unixEOLInTextFiles() throws IOException { + Path bagFile = tempFolder.newFolder("bag33").toPath(); + Filler filler = new Filler(bagFile, "MD5", Filler.EolRule.UNIX); + OutputStream plout = filler.payloadStream("first.pdf"); + for (int i = 0; i < 1000; i++) { + plout.write("lskdflsfevmep".getBytes()); + } + Path fullBag = filler.toDirectory(); + // use bag-info.txt as representative text-file + Path info = fullBag.resolve("bag-info.txt"); + // line ending is the Unix newline, regardless of the system-defined one + assertTrue(new Loader(fullBag).load().isValid()); + assertTrue(findSeparator(info).equals("\n")); + } + + @Test + public void windowsEOLInTextFiles() throws IOException { + Path bagFile = tempFolder.newFolder("bag34").toPath(); + Filler filler = new Filler(bagFile, "MD5", Filler.EolRule.WINDOWS); + OutputStream plout = filler.payloadStream("first.pdf"); + for (int i = 0; i < 1000; i++) { + plout.write("lskdflsfevmep".getBytes()); + } + Path fullBag = filler.toDirectory(); + // use bag-info.txt as representative text-file + Path info = fullBag.resolve("bag-info.txt"); + // line ending is Windows CR/LF, regardless of the system-defined one + assertTrue(new Loader(fullBag).load().isValid()); + assertTrue(findSeparator(info).equals("\r\n")); + } + + @Test + public void 
counterEOLInTextFiles() throws IOException { + Path bagFile = tempFolder.newFolder("bag35").toPath(); + Filler filler = new Filler(bagFile, "MD5", Filler.EolRule.COUNTER_SYSTEM); + OutputStream plout = filler.payloadStream("first.pdf"); + for (int i = 0; i < 1000; i++) { + plout.write("lskdflsfevmep".getBytes()); + } + Path fullBag = filler.toDirectory(); + // use bag-info.txt as representative text-file + Path info = fullBag.resolve("bag-info.txt"); + // line ending is not the same as system-defined one + assertTrue(new Loader(fullBag).load().isValid()); + assertTrue(! findSeparator(info).equals(System.lineSeparator())); + } + + private String findSeparator(Path file) throws IOException { + try (Scanner scanner = new Scanner(file)) { + // it's one or the other + return (scanner.findWithinHorizon("\r\n", 500) != null) ? "\r\n" : "\n"; + } + } + private int lineCount(Path file) throws IOException { Scanner scanner = new Scanner(file); int count = 0;