Merge pull request #117 from rhubner/url-decode

Url decode
digital-preservation · May 26, 2016 · b6ff075 · b6ff075
2 parents 4726be4 + aa952fc
commit b6ff075
Show file tree

Hide file tree

Showing 17 changed files with 275 additions and 7 deletions.
diff --git a/...alidator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/api/CsvValidator.scala b/...alidator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/api/CsvValidator.scala
@@ -12,8 +12,8 @@ import uk.gov.nationalarchives.csv.validator.schema.{Schema, SchemaParser}
 import scalaz._, Scalaz._
 import scalax.file.Path
 import uk.gov.nationalarchives.csv.validator._
-import java.io.{Reader => JReader, File}
-import java.nio.charset.{Charset => JCharset}
+import _root_.java.io.{Reader => JReader, File}
+import _root_.java.nio.charset.{Charset => JCharset}
 
 object CsvValidator {
 

diff --git a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Schema.scala b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Schema.scala
@@ -15,7 +15,7 @@ import util.parsing.input.Positional
 case class Schema(globalDirectives: List[GlobalDirective], columnDefinitions: List[ColumnDefinition], version: String = Schema.version)
 
 object Schema {
-  val version = "1.1"
+  val version = "1.2"
 }
 
 abstract class GlobalDirective(val name: String) extends Positional

diff --git a/...dator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/SchemaParser.scala b/...dator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/SchemaParser.scala
@@ -10,6 +10,7 @@ package uk.gov.nationalarchives.csv.validator.schema
 
 import uk.gov.nationalarchives.csv.validator.schema.v1_0.{SchemaParser => SchemaParser1_0}
 import uk.gov.nationalarchives.csv.validator.schema.v1_1.{SchemaParser => SchemaParser1_1, _}
+import uk.gov.nationalarchives.csv.validator.schema.v1_2.{SchemaParser => SchemaParser1_2, _}
 
 import scala.util.parsing.combinator._
 import scala.language.reflectiveCalls
@@ -147,6 +148,18 @@ with TraceableParsers {
 
     SchemaValidator.versionValid(version).map(Failure(_, next)).getOrElse {
       version match {
+        case "1.2" =>
+          val parser1_2 = new SchemaParser1_2 {override val enforceCaseSensitivePathChecks: Boolean = ecspc
+            override val pathSubstitutions: List[(String, String)] = ps
+            override val trace: Boolean = t
+          }
+
+          parser1_2.parseVersionAware(reader) match {
+            case parser1_2.Success(s, n) => Success(s, n)
+            case parser1_2.Failure(msg, n) => Failure(msg, n)
+            case parser1_2.Error(msg, n) => Error(msg, n)
+          }
+
         case "1.1" =>
           val parser1_1 = new SchemaParser1_1 {override val enforceCaseSensitivePathChecks: Boolean = ecspc
             override val pathSubstitutions: List[(String, String)] = ps

diff --git a/...idator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_2/Schema.scala b/...idator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_2/Schema.scala
@@ -0,0 +1,28 @@
+/**
+ * Copyright (c) 2013, The National Archives <digitalpreservation@nationalarchives.gov.uk>
+ * http://www.nationalarchives.gov.uk
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+package uk.gov.nationalarchives.csv.validator.schema.v1_2
+
+import uk.gov.nationalarchives.csv.validator.metadata.Row
+import uk.gov.nationalarchives.csv.validator.schema.{Schema, ArgProvider}
+import java.net.{URLDecoder => JURLDecoder}
+
+case class UriDecode(value: ArgProvider, charset: Option[ArgProvider]) extends ArgProvider {
+
+  val DefaultCharset = "UTF-8"
+
+  override def referenceValue(columnIndex: Int, row: Row, schema: Schema): Option[String] = value.referenceValue(columnIndex, row, schema).map( value => {
+
+    val codepage = charset.flatMap(x => x.referenceValue(columnIndex, row, schema)).getOrElse(DefaultCharset)
+
+    JURLDecoder.decode(value,codepage)
+
+  })
+
+  override def toError: String = "uriDecode(" + value.toError + ")"
+}
diff --git a/...-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_2/SchemaParser.scala b/...-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_2/SchemaParser.scala
@@ -0,0 +1,28 @@
+/**
+ * Copyright (c) 2013, The National Archives <digitalpreservation@nationalarchives.gov.uk>
+ * http://www.nationalarchives.gov.uk
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+package uk.gov.nationalarchives.csv.validator.schema.v1_2
+
+import scala.language.reflectiveCalls
+import uk.gov.nationalarchives.csv.validator.schema.{Literal, ArgProvider}
+import uk.gov.nationalarchives.csv.validator.schema.v1_1.{SchemaParser => SchemaParser1_1}
+
+trait SchemaParser extends SchemaParser1_1 {
+
+  /**
+   * [59] StringProvider ::= ColumnRef | StringLiteral
+   */
+  override lazy val stringProvider: PackratParser[ArgProvider] = "StringProvider" ::= noext | concat | urlDecode | columnRef | stringLiteral ^^ {
+    s => Literal(Some(s))
+  }
+
+  lazy val urlDecode: PackratParser[ArgProvider] = "UriDecode" ::= "uriDecode(" ~> stringProvider ~ opt("," ~> stringProvider)  <~ ")" ^^ {
+    case value ~ charset => UriDecode(value, charset)
+  }
+
+}
diff --git a/...r-core/src/test/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecode.csvs b/...r-core/src/test/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecode.csvs
@@ -0,0 +1,4 @@
+version 1.2
+@totalColumns 2 @noHeader
+identifier:
+filename: in(uriDecode($identifier))
diff --git a/...ore/src/test/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecodeFail.csv b/...ore/src/test/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecodeFail.csv
@@ -0,0 +1 @@
+file:/some/folder/some%21file.txt,some file.txt
diff --git a/...ore/src/test/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecodePass.csv b/...ore/src/test/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecodePass.csv
@@ -0,0 +1 @@
+file:/some/folder/some%20file.txt,some file.txt
diff --git a/...test/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecodeWithCharset.csvs b/...test/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecodeWithCharset.csvs
@@ -0,0 +1,5 @@
+version 1.2
+@totalColumns 3 @noHeader
+identifier:
+filename: in(uriDecode($identifier, $charset))
+charset:
diff --git a/...t/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecodeWithCharsetPass.csv b/...t/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecodeWithCharsetPass.csv
@@ -0,0 +1,2 @@
+file:/some/folder/some%20file.txt,some file.txt,UTF-8
+file:/some/folder/text%9Atext.txt,textštext.txt,windows-1252
diff --git a/...nalarchives/csv/validator/integrityCheck/WO_95/tech_acq_metadata_v1_WO95Y14B000_v1.2.csvs b/...nalarchives/csv/validator/integrityCheck/WO_95/tech_acq_metadata_v1_WO95Y14B000_v1.2.csvs
@@ -0,0 +1,120 @@
+version 1.2
+@totalColumns 33
+/*---------------------------------------------------------------------------------------------------------------
+|This schema is for the validation of technical acquisition metadata                                            |
+|csv files according to the specification given for digitised surrogates in                                     |
+|http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf  |
+|This version is for WO 95 digitisation in the period 2014-15                                                   |
+|  20140818   Version 1.0   DHU   First release version for this project                                        |
+|  20140910   Version 1.1   DHU   Updated date regex to fix issues, allowed items up to 14, disallow fullstops  |
+|at end of description as this causes search issues in Discovery.                                               |
+|  20141016 version 1.2 NW Updated regex to allow 20 items, 500 ordinals & addition of legal_status             |
+|and held_by fields, changed date column to covering_date                                                       |
+|  20141110 version 1.3 NW fixed sub_sub_series rule                                                            |
+|from sub_sub_series: range(1,7) or is("115") or if($piece/is("5500"),is(""))                                   |
+|to sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115")))                                      |
+|  20160511 - RH - Update schema version to test CSV validator backward compatibility                           |
+---------------------------------------------------------------------------------------------------------------*/
+/*The header of the schema file, ie the statements version 1.0 and @totalColumns 31, indicates that this schema
+  is using version 1.0 of the schema language (NB, not that that it is version 1.0 of this particular schema),
+  and that there are 31 columns in total in the file.*/
+batch_code: length(1,11) regex("^WO95Y14B([0-9]{3}|smp)$")
+  //1st part, batch_code must be between 1 and 11 characters long, and (implicitly multiple conditions are joined
+  //by a logical AND unless another boolean is provided). 2nd part restricts to form similar to WO95Y14B000 (last
+  //three digits are running number for batches throughout the project.
+department: is("WO") and (in($file_path) and in($resource_uri))
+  //Parentheses control evaluation order of booleans as might be expected
+  //Department is fixed value of WO for this project.
+  //The grouped "in" statements say that the value found in this field must also be found as part of the fields
+  //"file_path" and "resource_uri"
+division: is("13")
+  //this field must be precisely 13
+series: is("95") and (in($file_path) and in($resource_uri))
+  //Fixed value of 95 for this project
+  //The value must also be part of the fields "file_path" and "resource_uri"
+sub_series: is("1")
+  //For the 2014-15 project all material to be digitised is in sub_series 1 (France and Flanders)
+sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115")))
+  //As described in Appendix E of the ITT, the 1914-15 project is scanning material in sub_sub_series 1-7 and 115,
+  //Piece 5500 is also included which is not in any sub_sub_series, so the value is blank for that piece only.
+piece: if($sub_sub_series/is("1"),range(1,85),if($sub_sub_series/is("2"),range(86,153),if($sub_sub_series/is("3"),range(154,267),if($sub_sub_series/is("4"),range(268,358),if($sub_sub_series/is("5"),range(359,430),if($sub_sub_series/is("6"),range(431,517),if($sub_sub_series/is("7"),range(518,571),if($sub_sub_series/is("115"),range(3949,4193),if($sub_sub_series/is(""),is("5500")))))))))) and (in($file_path) and in($resource_uri))
+  //For this project there is a defined relationship between piece ranges as listed in Appendix E
+  //This is encapsulated in this rather complex if,then,else statement
+ //The value must also be part of the fields "file_path" and "resource_uri"
+item: (range(1,20) and in($file_path)) or is("")
+  //Most pieces are subdivided into items, there are not expected to be more than 10 per piece
+  //The value must also be part of the fields "file_path" and "resource_uri"
+  //In many cases the item level is not used, so this would be left blank.
+  //as the sorting/cataloguing process advances this condition may be tightened
+ordinal: range(1,500) and in($file_path) unique($department,$division,$series,$sub_series,$sub_sub_series,$piece,$item,$ordinal)
+  //the ordinal is a simple running count of the images within an item (or piece if not itemised).
+  //No single item (or piece if not itemised) should contain more than 150 pages but rule changed to 500 to allow for exceptions
+  //This (with leading zeroes) also forms the final part of the filepath, immediately before the .jp2 extension
+  //the combination of fields indicated should be unique within the file
+description: not("") and regex("^.*[^\.]$")
+  //description is a fairly free-form field, but must not be empty
+covering_date: regex("^19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?(-19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?)?$")
+  //dates according to The National Archives' cataloguing standards, expected to be a range for this project, but may be relaxed
+legal_status: is("Public Record")
+held_by: is("The National Archives, Kew")
+file_uuid: uuid4 unique
+  //must be a version 4 uuid, and the value must be unique within the file.  uuids must be lower case.
+file_path: uri starts("file:///WO_95/") unique fileExists integrityCheck("excludeFolder")
+  //fileExists checks that there is actually a file of the given name at the specified location on the file system.
+  //In practice, the validator will normally be run with the --path switch
+  //(see http://digital-preservation.github.io/csv-validator/)
+  //We also require that the path is a valid uri, and begins file:///WO_95/ as this is the top-level folder for each batch
+  //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
+  //content of this field)
+  //must be unique within the file
+file_checksum: unique checksum(file($file_path),"SHA-256")
+  //Compare the value given in this field to the checksum calculated for the file found at the location given in
+  //the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself).
+  //Use the specified checksum algorithm (must use lowercase hex characters).
+  //unique within the file - an identical checksum would imply identical images
+resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/WO/95/") unique
+  //Must be a valid uri which starts with the specified string, the uri is constructed such that it must be unique in the file
+  //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
+  //content of this field)
+scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
+  //12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is
+  //restricted to the scanning company to avoid personally identifying data being held in the file
+scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
+  //Like "scan_operator", but this code represents the actual scanner or camera used
+scan_location: regex("[-\w\s,.]+")
+  //Address or other description of the location where scanning physically occurred. The regex allows any number
+  //of characters, allows general word and whitespace characters plus hyphen, comma and full stop
+image_resolution: positiveInteger (is("300") or is("600"))
+  //Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used.
+  //Depending how this is populated (whether nominal or actual resolution), it might be better to use a range
+  //eg range(298,302) to capture slight variances in resolution.
+image_width: positiveInteger
+  //Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this
+image_height: positiveInteger
+  //Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this
+image_tonal_resolution: is("24-bit colour")
+  //must be string: 24-bit colour (precisely - case as shown).  Occasionally a different value might be specified.
+image_format: is("x-fmt/392")
+  //must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM
+  //(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392)
+image_compression: positiveInteger is("6")
+  //Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm
+  //available in the JPEG2000 specification
+image_colour_space: is("sRGB")
+  //must be string: sRGB (precisely - case as shown)
+image_split: is("yes") or is("no") or is("composite")
+  //must be string: yes; or string: no or string: composite (precisely - case as shown).  Used if eg an image of complete double page
+  //subsequently split into two separate images of each page individually, or if an oversize document is imaged as a composite of several images
+image_split_ordinal: if($image_split/is("composite"),range(1,9),is(""))
+  //describes the ordering of the individual "tiles" when an oversize documents has to be imaged in sections as a composite.
+  //9 is expected to be sufficient, but will be reviewed if required
+  //if image_split is not composite it must be blank
+image_split_other_uuid: if($image_split/is("no"),is(""),regex("^[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}(,[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}){0,8}$"))
+  //if "image_split" field is no, must be blank
+  //else it must be a uuid4 or comma separated list of up to 9 uuid4s
+  //due to the requirement to allow a comma separated list regex has had to be used, rather than the built in uuid4 datatype
+image_crop: is("auto") or is("manual") or is("none")
+  //must be string: auto; or string: manual or string: none (precisely - case as shown)
+image_deskew: is("yes") or is("no")
+  //must be string: yes; or string: no (precisely - case as shown)
+comments: regex("[\w\s,.]+") @optional
diff --git a/...rc/test/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorAcceptanceSpec.scala b/...rc/test/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorAcceptanceSpec.scala
@@ -516,6 +516,21 @@ class MetaDataValidatorAcceptanceSpec extends Specification with TestResources {
     }
   }
 
+  "Url decode string provider" should {
+    "decode url string to normal string" in {
+      validate(TextFile(Path.fromString(base) / "uriDecodePass.csv"), parse(base + "/uriDecode.csvs"), None).isSuccess mustEqual true
+    }
+
+    "fail for wrong url" in {
+      validate(TextFile(Path.fromString(base) / "uriDecodeFail.csv"), parse(base + "/uriDecode.csvs"), None).isFailure mustEqual true
+    }
+
+    "decode URL with optional charset parameter" in {
+
+      validate(TextFile(Path.fromString(base) / "uriDecodeWithCharsetPass.csv"), parse(base + "/uriDecodeWithCharset.csvs"), None).isSuccess mustEqual true
+    }
+  }
+
   "Concat string provider" should {
     "should concatenate string provider" in {
       validate(TextFile(Path.fromString(base) / "concatPass.csv"), parse(base + "/concat.csvs"), None).isSuccess mustEqual true

diff --git a/...est/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorIntegrityCheckSpec.scala b/...est/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorIntegrityCheckSpec.scala
@@ -108,6 +108,13 @@ class MetaDataValidatorIntegrityCheckSpec extends Specification with TestResourc
       validator.validate(TextFile(Path.fromString(WO95Path) / "tech_acq_metadata_v1_WO95Y14B003.csv"), parse(WO95Path + "/tech_acq_metadata_v1_WO95Y14B000.csvs",validator), None).isSuccess mustEqual true
     }
 
+    "Validate WO 95 with 1.2 schema version to test backward compatibility" in {
+
+      val substitutionPaths = List(("file:///WO_95",WO95Path))
+      val validator = buildValidator(substitutionPaths)
+      validator.validate(TextFile(Path.fromString(WO95Path) / "tech_acq_metadata_v1_WO95Y14B003.csv"), parse(WO95Path + "/tech_acq_metadata_v1_WO95Y14B000_v1.2.csvs",validator), None).isSuccess mustEqual true
+    }
+
 
     "succeed with alternative substitution paths - header" in {
 

diff --git a/csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/UtilSpec.scala b/csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/UtilSpec.scala
@@ -53,7 +53,7 @@ class UtilSpec extends Specification with TestResources  {
 
       val apiFiles = Util.findAllFiles(true, new File(acceptancePath))
 
-      apiFiles must haveLength(120)
+      apiFiles must haveLength(125)
 
       apiFiles must contain (new File(s"$basePath/uk/gov/nationalarchives/csv/validator/acceptance/twoRulesPassMetaData.csv"))
 
@@ -65,7 +65,7 @@ class UtilSpec extends Specification with TestResources  {
 
       val integrityCheckFiles =  Util.findAllFiles(true, new File(base))
 
-      integrityCheckFiles  must haveLength(42)
+      integrityCheckFiles  must haveLength(43)
 
       integrityCheckFiles must contain (new File(s"$basePath/uk/gov/nationalarchives/csv/validator/integrityCheck/header/integrityCheckSchema.csvs"))
 
@@ -82,7 +82,7 @@ class UtilSpec extends Specification with TestResources  {
 
       val integrityCheckFilesNoFolder =  Util.findAllFiles(false, new File(base))
 
-      integrityCheckFilesNoFolder  must haveLength(28)
+      integrityCheckFilesNoFolder  must haveLength(29)
 
       integrityCheckFilesNoFolder must contain (new File(s"$basePath/uk/gov/nationalarchives/csv/validator/integrityCheck/header/content/file1"))
 

diff --git a/...tor-core/src/test/scala/uk/gov/nationalarchives/csv/validator/schema/SchemaSpecBase.scala b/...tor-core/src/test/scala/uk/gov/nationalarchives/csv/validator/schema/SchemaSpecBase.scala
@@ -22,6 +22,9 @@ trait SchemaSpecBase extends Specification {
   def buildSchema1_1(globalDirective: GlobalDirective*)(columnDefinition: ColumnDefinition*) =
     Schema(globalDirective.toList, columnDefinition.toList, "1.1")
 
+  def buildSchema1_2(globalDirective: GlobalDirective*)(columnDefinition: ColumnDefinition*) =
+    Schema(globalDirective.toList, columnDefinition.toList, "1.2")
+
   def namedColumn(name: String) = ColumnDefinition(NamedColumnIdentifier(name))
 
   def nonEmptyColumn(name: String): ColumnDefinition =

diff --git a/...est/scala/uk/gov/nationalarchives/csv/validator/schema/v1_1/SchemaParserVersionSpec.scala b/...est/scala/uk/gov/nationalarchives/csv/validator/schema/v1_1/SchemaParserVersionSpec.scala
@@ -51,7 +51,7 @@ class SchemaParserVersionSpec extends SchemaSpecBase with TestResources{
                       LastName: @IgnoreCase regex ("[a]")"""
 
       parse(new StringReader(schema)) must beLike {
-        case Failure(messages, _) => messages mustEqual s"Invalid schema version. This version of the csv validator supports only 1.1 and below."
+        case Failure(messages, _) => messages mustEqual s"Invalid schema version. This version of the csv validator supports only 1.2 and below."
       }
     }