Skip to content

Commit

Permalink
Merge pull request #10 from richardrodgers/1_handle_eol
Browse files Browse the repository at this point in the history
Adds support for client assignment of line termination for generated …
  • Loading branch information
Richard Rodgers committed Apr 29, 2016
2 parents 879ba20 + 8be9ce6 commit 006ca44
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 21 deletions.
16 changes: 14 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,18 @@ Or the bag contents may be obtained from a network stream:

For all the API details consult the [Javadoc](http://richardrodgers.github.io/bagit/javadoc/index.html)

## Portability ##

Bags are intended to be portable data containers: one should be able to write them on one operating system
and read them on another. The spec contemplates this in specific ways, e.g. by allowing text files such as
'bag-info.txt' to have _either_ Unix-style or Windows-style line termination. Tools operating on bags ought
to expect and tolerate this diversity, but do not always. The library provides some assistance here by allowing the user
to specify a preference when creating bags. Thus, if the context of use (lifecycle) for a set of bags is known to be in
a Windows environment, the library can be instructed to use Windows line termination for the generated text files in bags,
even if the bags are being generated on a Unix system. By default, the library will use the line termination of the
operating system it is running on (CR/LF, i.e. '\r\n', on Windows; LF, i.e. '\n', on Unix and macOS), but this can be
overridden by passing an `EolRule` to the `Filler` constructor.
See the [Javadoc](http://richardrodgers.github.io/bagit/javadoc/index.html) for details.

## Archive formats ##

Bags are commonly serialized to standard archive formats such as ZIP. The library supports two archive formats:
Expand Down Expand Up @@ -119,14 +131,14 @@ Fat jars include all dependencies in a single executable jar (no classpath decla
The distribution jars are kept at [Bintray](https://bintray.com), so make sure that repository is declared.
Then (NB: using the most current version), for Gradle:

compile 'edu.mit.lib:bagit:0.6'
compile 'edu.mit.lib:bagit:0.7'

or Maven:

<dependency>
<groupId>edu.mit.lib</groupId>
<artifactId>bagit</artifactId>
<version>0.6</version>
<version>0.7</version>
</dependency>

in a standard pom.xml dependencies block.
4 changes: 2 additions & 2 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ sourceCompatibility = 1.8

group = 'edu.mit.lib'
archivesBaseName = 'bagit'
version = '0.6'
version = '0.7'
description = 'Compact Java BagIt library'

ext {
Expand All @@ -44,7 +44,7 @@ task sourcesJar(type: Jar) {
from sourceSets.main.allSource
}

task javadocJar(type: Jar) {
task javadocJar(type: Jar, dependsOn: javadoc) {
classifier = 'javadoc'
from javadoc.destinationDir
}
2 changes: 1 addition & 1 deletion src/main/java/edu/mit/lib/bagit/Bag.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ public class Bag {
static final String ENCODING = "UTF-8";
static final String CS_ALGO = "MD5";
static final String BAGIT_VSN = "0.97";
static final String LIB_VSN = "0.6";
static final String LIB_VSN = "0.7";
static final String DFLT_FMT = "zip";
static final String TGZIP_FMT = "tgz";
static final String SPACER = " ";
Expand Down
67 changes: 60 additions & 7 deletions src/main/java/edu/mit/lib/bagit/Filler.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@

import static edu.mit.lib.bagit.Bag.*;
import static edu.mit.lib.bagit.Bag.MetadataName.*;
import static edu.mit.lib.bagit.Filler.EolRule.*;

/**
* Filler is a builder class used to construct bags conformant to LC Bagit spec - version 0.97.
Expand Down Expand Up @@ -75,38 +76,82 @@ public class Filler {
private boolean built;
// transient bag?
private boolean transientBag;
// line separator used by FlatWriters
private final String lineSeparator;

/**
* Rule for assigning the EOL (line termination/separation)
* used when writing the generated text files of a bag.
*/
public enum EolRule {
/**
* Use system-defined separator, i.e. the value
* reported by System.lineSeparator()
*/
SYSTEM,
/**
* Use Windows separators on Unix systems,
* or vice versa: CR/LF when the system separator
* is '\n', otherwise '\n'
*/
COUNTER_SYSTEM,
/**
* Use Unix new-line '\n' separators
*/
UNIX,
/**
* Use Windows CR/LF ('\r\n') separators
*/
WINDOWS
}

/**
* Returns a new Filler (bag builder) instance using
* temporary directory to hold bag and default checksum
* algorithm (MD5).
* temporary directory to hold bag, default checksum
* algorithm (MD5), and system-defined line separator
*
* @throws IOException if error creating bag
*/
public Filler() throws IOException {
this(null, null);
this(null, null, SYSTEM);
}

/**
* Returns a new Filler (bag builder) instance using passed
* directory to hold bag and default checksum algorithm (MD5).
* directory to hold bag, default checksum algorithm (MD5),
* and system-defined line separator
*
* @param base the base directory in which to construct the bag
* @throws IOException if error creating bag
*/
public Filler(Path base) throws IOException {
this(base, null);
this(base, null, SYSTEM);
}

/**
* Returns a new filler (bag builder) instance using passed directory
* and checksum algorithm.
* and checksum algorithm, and system-defined line separator
*
* @param base directory for bag - if null, create temporary directory
* @param csAlgorithm checksum algorithm string - if null use default
* @throws IOException if error creating bag
*/
public Filler(Path base, String csAlgorithm) throws IOException {
// delegate to the three-argument constructor, keeping the system-defined line separator
this(base, csAlgorithm, SYSTEM);
}

/**
* Returns a new filler (bag builder) instance using the passed directory,
* checksum algorithm, and line separator for text files.
*
* @param base directory for bag - if null, create temporary directory
* @param csAlgorithm checksum algorithm string - if null use default
* @param eolRule line termination rule to use for generated text files. Values are:
* SYSTEM - use system-defined line termination
* COUNTER_SYSTEM - if on Windows, use Unix EOL, else reverse
* UNIX - use newline character line termination
* WINDOWS - use CR/LF line termination
*
* @throws IOException if error creating bag
*/
public Filler(Path base, String csAlgorithm, EolRule eolRule) throws IOException {
if (base != null) {
this.base = base;
} else {
Expand All @@ -130,6 +175,14 @@ public Filler(Path base, String csAlgorithm) throws IOException {
autogenNames.add(BAG_SIZE);
autogenNames.add(PAYLOAD_OXUM);
autogenNames.add(BAG_SOFTWARE_AGENT);
String sysEol = System.lineSeparator();
switch (eolRule) {
case SYSTEM: lineSeparator = sysEol; break;
case UNIX: lineSeparator = "\n"; break;
case WINDOWS: lineSeparator = "\r\n"; break;
case COUNTER_SYSTEM: lineSeparator = "\n".equals(sysEol) ? "\r\n" : "\n"; break;
default: lineSeparator = sysEol; break;
}
}

private void buildBag() throws IOException {
Expand Down Expand Up @@ -468,7 +521,7 @@ public void writeLine(String line) throws IOException {
if (record) {
lines.add(line);
}
byte[] bytes = (line + "\n").getBytes(ENCODING);
byte[] bytes = (line + lineSeparator).getBytes(ENCODING);
write(bytes);
}

Expand Down
24 changes: 15 additions & 9 deletions src/main/java/edu/mit/lib/bagit/Loader.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.util.Map;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.Scanner;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

Expand All @@ -51,6 +52,8 @@ public class Loader {
private final ConcurrentMap<String, String> payloadRefMap = new ConcurrentHashMap<>();
// manifest writer
private LoaderWriter manWriter;
// line separator discovered in Bag text file
private String lineSeparator = System.lineSeparator();

/**
* Returns a new Loader (bag loader) instance using passed
Expand Down Expand Up @@ -107,7 +110,7 @@ public Loader(Path parent, InputStream in, String format) throws IOException {
}

/**
* Returns the checksum algortihm used in bag manifests.
* Returns the checksum algorithm used in bag manifests.
*
* @return algorithm the checksum algorithm
* @throws IOException if error loading bag
Expand Down Expand Up @@ -147,11 +150,10 @@ private void finish() throws IOException {
manWriter.close();
// Update fetch.txt - remove if all holes plugged, else filter
Path refFile = bagFile(REF_FILE);
List<String> refLines = bufferFile(refFile);
if (payloadRefMap.size() > 0) {
// now reconstruct fetch.txt filtering out those resolved
try (OutputStream refOut = Files.newOutputStream(refFile)) {
for (String refline : refLines) {
for (String refline : bufferFile(refFile)) {
String[] parts = refline.split(" ");
if (payloadRefMap.containsKey(parts[2])) {
refOut.write(refline.getBytes(ENCODING));
Expand All @@ -172,10 +174,10 @@ private void finish() throws IOException {
for (String tline : tmLines) {
String[] parts = tline.split(" ");
if (parts[1].startsWith(MANIF_FILE)) {
tagManOut.write((manCS + " " + MANIF_FILE + sfx + "\n").getBytes(ENCODING));
tagManOut.write((manCS + " " + MANIF_FILE + sfx + lineSeparator).getBytes(ENCODING));
} else if (parts[1].startsWith(REF_FILE)) {
if (fetchCS != null) {
tagManOut.write((fetchCS + " " + REF_FILE + sfx + "\n").getBytes(ENCODING));
tagManOut.write((fetchCS + " " + REF_FILE + sfx + lineSeparator).getBytes(ENCODING));
}
} else {
tagManOut.write(tline.getBytes(ENCODING));
Expand All @@ -190,7 +192,7 @@ private List<String> bufferFile(Path file) throws IOException {
try (BufferedReader reader = Files.newBufferedReader(file, StandardCharsets.UTF_8)) {
String line = null;
while ((line = reader.readLine()) != null) {
lines.add(line + "\n");
lines.add(line + lineSeparator);
}
}
Files.delete(file);
Expand Down Expand Up @@ -257,8 +259,12 @@ private Path dataFile(String name) throws IOException {
// lazy initialization of manifest writer
private synchronized LoaderWriter manifestWriter() throws IOException {
if (manWriter == null) {
String sfx = csAlgorithm().toLowerCase() + ".txt";
manWriter = new LoaderWriter(bagFile(MANIF_FILE + sfx), null, true, null);
Path manif = bagFile(MANIF_FILE + csAlgorithm().toLowerCase() + ".txt");
// set line separator for writer to match existing file encoding
try (Scanner sc = new Scanner(manif)) {
lineSeparator = (sc.findWithinHorizon("\r\n", 500) != null) ? "\r\n" : "\n";
}
manWriter = new LoaderWriter(manif, null, true, null);
}
return manWriter;
}
Expand All @@ -270,7 +276,7 @@ private LoaderWriter(Path file, String brPath, boolean append, LoaderWriter tail
}

public void writeLine(String line) throws IOException {
write((line + "\n").getBytes(ENCODING));
write((line + lineSeparator).getBytes(ENCODING));
}
}

Expand Down
72 changes: 72 additions & 0 deletions src/test/java/edu/mit/lib/bagit/BagTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

import static edu.mit.lib.bagit.Bag.*;
import static edu.mit.lib.bagit.Bag.MetadataName.*;
import static edu.mit.lib.bagit.Filler.EolRule.*;

/*
* Basic unit tests for BagIt Library. Incomplete.
Expand Down Expand Up @@ -578,6 +579,77 @@ public void loadedFromStreamValid() throws IOException {
assertTrue(loadedBag.isValid());
}

// A Filler created without an explicit EOL rule writes text files
// using the system-defined line separator.
@Test
public void defaultEOLInTextFiles() throws IOException {
Path bagFile = tempFolder.newFolder("bag32").toPath();
Filler filler = new Filler(bagFile);
OutputStream plout = filler.payloadStream("first.pdf");
for (int i = 0; i < 1000; i++) {
plout.write("lskdflsfevmep".getBytes());
}
Path fullBag = filler.toDirectory();
// use bag-info.txt as representative text-file
Path info = fullBag.resolve("bag-info.txt");
// bag must load as valid, and line ending is same as system-defined one
assertTrue(new Loader(fullBag).load().isValid());
assertTrue(findSeparator(info).equals(System.lineSeparator()));
}

// A Filler created with the UNIX rule writes text files using '\n'
// line termination.
@Test
public void unixEOLInTextFiles() throws IOException {
Path bagFile = tempFolder.newFolder("bag33").toPath();
Filler filler = new Filler(bagFile, "MD5", Filler.EolRule.UNIX);
OutputStream plout = filler.payloadStream("first.pdf");
for (int i = 0; i < 1000; i++) {
plout.write("lskdflsfevmep".getBytes());
}
Path fullBag = filler.toDirectory();
// use bag-info.txt as representative text-file
Path info = fullBag.resolve("bag-info.txt");
// bag must load as valid, and line ending is the Unix new-line as requested
assertTrue(new Loader(fullBag).load().isValid());
assertTrue(findSeparator(info).equals("\n"));
}

// A Filler created with the WINDOWS rule writes text files using CR/LF
// line termination.
@Test
public void windowsEOLInTextFiles() throws IOException {
Path bagFile = tempFolder.newFolder("bag34").toPath();
Filler filler = new Filler(bagFile, "MD5", Filler.EolRule.WINDOWS);
OutputStream plout = filler.payloadStream("first.pdf");
for (int i = 0; i < 1000; i++) {
plout.write("lskdflsfevmep".getBytes());
}
Path fullBag = filler.toDirectory();
// use bag-info.txt as representative text-file
Path info = fullBag.resolve("bag-info.txt");
// bag must load as valid, and line ending is the Windows CR/LF as requested
assertTrue(new Loader(fullBag).load().isValid());
assertTrue(findSeparator(info).equals("\r\n"));
}

// A Filler created with the COUNTER_SYSTEM rule writes text files using
// a line termination that differs from the system-defined one.
@Test
public void counterEOLInTextFiles() throws IOException {
Path bagFile = tempFolder.newFolder("bag35").toPath();
Filler filler = new Filler(bagFile, "MD5", Filler.EolRule.COUNTER_SYSTEM);
OutputStream plout = filler.payloadStream("first.pdf");
for (int i = 0; i < 1000; i++) {
plout.write("lskdflsfevmep".getBytes());
}
Path fullBag = filler.toDirectory();
// use bag-info.txt as representative text-file
Path info = fullBag.resolve("bag-info.txt");
// bag must load as valid, and line ending is not the same as system-defined one
assertTrue(new Loader(fullBag).load().isValid());
assertTrue(! findSeparator(info).equals(System.lineSeparator()));
}

// Reports which line terminator the given text file uses, judged from
// the first 500 characters of its content.
private String findSeparator(Path file) throws IOException {
    try (Scanner probe = new Scanner(file)) {
        // a CR/LF pair near the start means Windows termination
        String crLfHit = probe.findWithinHorizon("\r\n", 500);
        if (crLfHit != null) {
            return "\r\n";
        }
        // nothing found: it's one or the other, so it must be Unix
        return "\n";
    }
}

private int lineCount(Path file) throws IOException {
Scanner scanner = new Scanner(file);
int count = 0;
Expand Down

0 comments on commit 006ca44

Please sign in to comment.