Skip to content

Commit

Permalink
Merge pull request #117 from rhubner/url-decode
Browse files Browse the repository at this point in the history
Url decode
  • Loading branch information
valydia committed May 26, 2016
2 parents 4726be4 + aa952fc commit b6ff075
Show file tree
Hide file tree
Showing 17 changed files with 275 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ import uk.gov.nationalarchives.csv.validator.schema.{Schema, SchemaParser}
import scalaz._, Scalaz._
import scalax.file.Path
import uk.gov.nationalarchives.csv.validator._
import java.io.{Reader => JReader, File}
import java.nio.charset.{Charset => JCharset}
import _root_.java.io.{Reader => JReader, File}
import _root_.java.nio.charset.{Charset => JCharset}

object CsvValidator {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import util.parsing.input.Positional
case class Schema(globalDirectives: List[GlobalDirective], columnDefinitions: List[ColumnDefinition], version: String = Schema.version)

object Schema {
val version = "1.1"
val version = "1.2"
}

abstract class GlobalDirective(val name: String) extends Positional
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ package uk.gov.nationalarchives.csv.validator.schema

import uk.gov.nationalarchives.csv.validator.schema.v1_0.{SchemaParser => SchemaParser1_0}
import uk.gov.nationalarchives.csv.validator.schema.v1_1.{SchemaParser => SchemaParser1_1, _}
import uk.gov.nationalarchives.csv.validator.schema.v1_2.{SchemaParser => SchemaParser1_2, _}

import scala.util.parsing.combinator._
import scala.language.reflectiveCalls
Expand Down Expand Up @@ -147,6 +148,18 @@ with TraceableParsers {

SchemaValidator.versionValid(version).map(Failure(_, next)).getOrElse {
version match {
case "1.2" =>
val parser1_2 = new SchemaParser1_2 {override val enforceCaseSensitivePathChecks: Boolean = ecspc
override val pathSubstitutions: List[(String, String)] = ps
override val trace: Boolean = t
}

parser1_2.parseVersionAware(reader) match {
case parser1_2.Success(s, n) => Success(s, n)
case parser1_2.Failure(msg, n) => Failure(msg, n)
case parser1_2.Error(msg, n) => Error(msg, n)
}

case "1.1" =>
val parser1_1 = new SchemaParser1_1 {override val enforceCaseSensitivePathChecks: Boolean = ecspc
override val pathSubstitutions: List[(String, String)] = ps
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/**
* Copyright (c) 2013, The National Archives <digitalpreservation@nationalarchives.gov.uk>
* http://www.nationalarchives.gov.uk
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
package uk.gov.nationalarchives.csv.validator.schema.v1_2

import uk.gov.nationalarchives.csv.validator.metadata.Row
import uk.gov.nationalarchives.csv.validator.schema.{Schema, ArgProvider}
import java.net.{URLDecoder => JURLDecoder}

case class UriDecode(value: ArgProvider, charset: Option[ArgProvider]) extends ArgProvider {

val DefaultCharset = "UTF-8"

override def referenceValue(columnIndex: Int, row: Row, schema: Schema): Option[String] = value.referenceValue(columnIndex, row, schema).map( value => {

val codepage = charset.flatMap(x => x.referenceValue(columnIndex, row, schema)).getOrElse(DefaultCharset)

JURLDecoder.decode(value,codepage)

})

override def toError: String = "uriDecode(" + value.toError + ")"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/**
* Copyright (c) 2013, The National Archives <digitalpreservation@nationalarchives.gov.uk>
* http://www.nationalarchives.gov.uk
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
package uk.gov.nationalarchives.csv.validator.schema.v1_2

import scala.language.reflectiveCalls
import uk.gov.nationalarchives.csv.validator.schema.{Literal, ArgProvider}
import uk.gov.nationalarchives.csv.validator.schema.v1_1.{SchemaParser => SchemaParser1_1}

trait SchemaParser extends SchemaParser1_1 {

/**
* [59] StringProvider ::= ColumnRef | StringLiteral
*/
override lazy val stringProvider: PackratParser[ArgProvider] = "StringProvider" ::= noext | concat | urlDecode | columnRef | stringLiteral ^^ {
s => Literal(Some(s))
}

lazy val urlDecode: PackratParser[ArgProvider] = "UriDecode" ::= "uriDecode(" ~> stringProvider ~ opt("," ~> stringProvider) <~ ")" ^^ {
case value ~ charset => UriDecode(value, charset)
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
version 1.2
@totalColumns 2 @noHeader
identifier:
filename: in(uriDecode($identifier))
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
file:/some/folder/some%21file.txt,some file.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
file:/some/folder/some%20file.txt,some file.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
version 1.2
@totalColumns 3 @noHeader
identifier:
filename: in(uriDecode($identifier, $charset))
charset:
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
file:/some/folder/some%20file.txt,some file.txt,UTF-8
file:/some/folder/text%9Atext.txt,textštext.txt,windows-1252
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
version 1.2
@totalColumns 33
/*---------------------------------------------------------------------------------------------------------------
|This schema is for the validation of technical acquisition metadata |
|csv files according to the specification given for digitised surrogates in |
|http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf |
|This version is for WO 95 digitisation in the period 2014-15 |
| 20140818 Version 1.0 DHU First release version for this project |
| 20140910 Version 1.1 DHU Updated date regex to fix issues, allowed items up to 14, disallow fullstops |
|at end of description as this causes search issues in Discovery. |
| 20141016 version 1.2 NW Updated regex to allow 20 items, 500 ordinals & addition of legal_status |
|and held_by fields, changed date column to covering_date |
| 20141110 version 1.3 NW fixed sub_sub_series rule |
|from sub_sub_series: range(1,7) or is("115") or if($piece/is("5500"),is("")) |
|to sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115"))) |
| 20160511 - RH - Update schema version to test CSV validator backward compatibility |
---------------------------------------------------------------------------------------------------------------*/
/*The header of the schema file, ie the statements version 1.0 and @totalColumns 31, indicates that this schema
is using version 1.0 of the schema language (NB, not that that it is version 1.0 of this particular schema),
and that there are 31 columns in total in the file.*/
batch_code: length(1,11) regex("^WO95Y14B([0-9]{3}|smp)$")
//1st part, batch_code must be between 1 and 11 characters long, and (implicitly multiple conditions are joined
//by a logical AND unless another boolean is provided). 2nd part restricts to form similar to WO95Y14B000 (last
//three digits are running number for batches throughout the project.
department: is("WO") and (in($file_path) and in($resource_uri))
//Parentheses control evaluation order of booleans as might be expected
//Department is fixed value of WO for this project.
//The grouped "in" statements say that the value found in this field must also be found as part of the fields
//"file_path" and "resource_uri"
division: is("13")
//this field must be precisely 13
series: is("95") and (in($file_path) and in($resource_uri))
//Fixed value of 95 for this project
//The value must also be part of the fields "file_path" and "resource_uri"
sub_series: is("1")
//For the 2014-15 project all material to be digitised is in sub_series 1 (France and Flanders)
sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115")))
//As described in Appendix E of the ITT, the 1914-15 project is scanning material in sub_sub_series 1-7 and 115,
//Piece 5500 is also included which is not in any sub_sub_series, so the value is blank for that piece only.
piece: if($sub_sub_series/is("1"),range(1,85),if($sub_sub_series/is("2"),range(86,153),if($sub_sub_series/is("3"),range(154,267),if($sub_sub_series/is("4"),range(268,358),if($sub_sub_series/is("5"),range(359,430),if($sub_sub_series/is("6"),range(431,517),if($sub_sub_series/is("7"),range(518,571),if($sub_sub_series/is("115"),range(3949,4193),if($sub_sub_series/is(""),is("5500")))))))))) and (in($file_path) and in($resource_uri))
//For this project there is a defined relationship between piece ranges as listed in Appendix E
//This is encapsulated in this rather complex if,then,else statement
//The value must also be part of the fields "file_path" and "resource_uri"
item: (range(1,20) and in($file_path)) or is("")
//Most pieces are subdivided into items, there are not expected to be more than 10 per piece
//The value must also be part of the fields "file_path" and "resource_uri"
//In many cases the item level is not used, so this would be left blank.
//as the sorting/cataloguing process advances this condition may be tightened
ordinal: range(1,500) and in($file_path) unique($department,$division,$series,$sub_series,$sub_sub_series,$piece,$item,$ordinal)
//the ordinal is a simple running count of the images within an item (or piece if not itemised).
//No single item (or piece if not itemised) should contain more than 150 pages but rule changed to 500 to allow for exceptions
//This (with leading zeroes) also forms the final part of the filepath, immediately before the .jp2 extension
//the combination of fields indicated should be unique within the file
description: not("") and regex("^.*[^\.]$")
//description is a fairly free-form field, but must not be empty
covering_date: regex("^19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?(-19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?)?$")
//dates according to The National Archives' cataloguing standards, expected to be a range for this project, but may be relaxed
legal_status: is("Public Record")
held_by: is("The National Archives, Kew")
file_uuid: uuid4 unique
//must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case.
file_path: uri starts("file:///WO_95/") unique fileExists integrityCheck("excludeFolder")
//fileExists checks that there is actually a file of the given name at the specified location on the file system.
//In practice, the validator will normally be run with the --path switch
//(see http://digital-preservation.github.io/csv-validator/)
//We also require that the path is a valid uri, and begins file:///WO_95/ as this is the top-level folder for each batch
//(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
//content of this field)
//must be unique within the file
file_checksum: unique checksum(file($file_path),"SHA-256")
//Compare the value given in this field to the checksum calculated for the file found at the location given in
//the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself).
//Use the specified checksum algorithm (must use lowercase hex characters).
//unique within the file - an identical checksum would imply identical images
resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/WO/95/") unique
//Must be a valid uri which starts with the specified string, the uri is constructed such that it must be unique in the file
//(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
//content of this field)
scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
//12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is
//restricted to the scanning company to avoid personally identifying data being held in the file
scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
//Like "scan_operator", but this code represents the actual scanner or camera used
scan_location: regex("[-\w\s,.]+")
//Address or other description of the location where scanning physically occurred. The regex allows any number
//of characters, allows general word and whitespace characters plus hyphen, comma and full stop
image_resolution: positiveInteger (is("300") or is("600"))
//Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used.
//Depending how this is populated (whether nominal or actual resolution), it might be better to use a range
//eg range(298,302) to capture slight variances in resolution.
image_width: positiveInteger
//Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this
image_height: positiveInteger
//Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this
image_tonal_resolution: is("24-bit colour")
//must be string: 24-bit colour (precisely - case as shown). Occasionally a different value might be specified.
image_format: is("x-fmt/392")
//must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM
//(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392)
image_compression: positiveInteger is("6")
//Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm
//available in the JPEG2000 specification
image_colour_space: is("sRGB")
//must be string: sRGB (precisely - case as shown)
image_split: is("yes") or is("no") or is("composite")
//must be string: yes; or string: no or string: composite (precisely - case as shown). Used if eg an image of complete double page
//subsequently split into two separate images of each page individually, or if an oversize document is imaged as a composite of several images
image_split_ordinal: if($image_split/is("composite"),range(1,9),is(""))
//describes the ordering of the individual "tiles" when an oversize documents has to be imaged in sections as a composite.
//9 is expected to be sufficient, but will be reviewed if required
//if image_split is not composite it must be blank
image_split_other_uuid: if($image_split/is("no"),is(""),regex("^[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}(,[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}){0,8}$"))
//if "image_split" field is no, must be blank
//else it must be a uuid4 or comma separated list of up to 9 uuid4s
//due to the requirement to allow a comma separated list regex has had to be used, rather than the built in uuid4 datatype
image_crop: is("auto") or is("manual") or is("none")
//must be string: auto; or string: manual or string: none (precisely - case as shown)
image_deskew: is("yes") or is("no")
//must be string: yes; or string: no (precisely - case as shown)
comments: regex("[\w\s,.]+") @optional
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,21 @@ class MetaDataValidatorAcceptanceSpec extends Specification with TestResources {
}
}

"Url decode string provider" should {
"decode url string to normal string" in {
validate(TextFile(Path.fromString(base) / "uriDecodePass.csv"), parse(base + "/uriDecode.csvs"), None).isSuccess mustEqual true
}

"fail for wrong url" in {
validate(TextFile(Path.fromString(base) / "uriDecodeFail.csv"), parse(base + "/uriDecode.csvs"), None).isFailure mustEqual true
}

"decode URL with optional charset parameter" in {

validate(TextFile(Path.fromString(base) / "uriDecodeWithCharsetPass.csv"), parse(base + "/uriDecodeWithCharset.csvs"), None).isSuccess mustEqual true
}
}

"Concat string provider" should {
"should concatenate string provider" in {
validate(TextFile(Path.fromString(base) / "concatPass.csv"), parse(base + "/concat.csvs"), None).isSuccess mustEqual true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,13 @@ class MetaDataValidatorIntegrityCheckSpec extends Specification with TestResourc
validator.validate(TextFile(Path.fromString(WO95Path) / "tech_acq_metadata_v1_WO95Y14B003.csv"), parse(WO95Path + "/tech_acq_metadata_v1_WO95Y14B000.csvs",validator), None).isSuccess mustEqual true
}

"Validate WO 95 with 1.2 schema version to test backward compatibility" in {

val substitutionPaths = List(("file:///WO_95",WO95Path))
val validator = buildValidator(substitutionPaths)
validator.validate(TextFile(Path.fromString(WO95Path) / "tech_acq_metadata_v1_WO95Y14B003.csv"), parse(WO95Path + "/tech_acq_metadata_v1_WO95Y14B000_v1.2.csvs",validator), None).isSuccess mustEqual true
}


"succeed with alternative substitution paths - header" in {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class UtilSpec extends Specification with TestResources {

val apiFiles = Util.findAllFiles(true, new File(acceptancePath))

apiFiles must haveLength(120)
apiFiles must haveLength(125)

apiFiles must contain (new File(s"$basePath/uk/gov/nationalarchives/csv/validator/acceptance/twoRulesPassMetaData.csv"))

Expand All @@ -65,7 +65,7 @@ class UtilSpec extends Specification with TestResources {

val integrityCheckFiles = Util.findAllFiles(true, new File(base))

integrityCheckFiles must haveLength(42)
integrityCheckFiles must haveLength(43)

integrityCheckFiles must contain (new File(s"$basePath/uk/gov/nationalarchives/csv/validator/integrityCheck/header/integrityCheckSchema.csvs"))

Expand All @@ -82,7 +82,7 @@ class UtilSpec extends Specification with TestResources {

val integrityCheckFilesNoFolder = Util.findAllFiles(false, new File(base))

integrityCheckFilesNoFolder must haveLength(28)
integrityCheckFilesNoFolder must haveLength(29)

integrityCheckFilesNoFolder must contain (new File(s"$basePath/uk/gov/nationalarchives/csv/validator/integrityCheck/header/content/file1"))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ trait SchemaSpecBase extends Specification {
def buildSchema1_1(globalDirective: GlobalDirective*)(columnDefinition: ColumnDefinition*) =
Schema(globalDirective.toList, columnDefinition.toList, "1.1")

def buildSchema1_2(globalDirective: GlobalDirective*)(columnDefinition: ColumnDefinition*) =
Schema(globalDirective.toList, columnDefinition.toList, "1.2")

def namedColumn(name: String) = ColumnDefinition(NamedColumnIdentifier(name))

def nonEmptyColumn(name: String): ColumnDefinition =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class SchemaParserVersionSpec extends SchemaSpecBase with TestResources{
LastName: @IgnoreCase regex ("[a]")"""

parse(new StringReader(schema)) must beLike {
case Failure(messages, _) => messages mustEqual s"Invalid schema version. This version of the csv validator supports only 1.1 and below."
case Failure(messages, _) => messages mustEqual s"Invalid schema version. This version of the csv validator supports only 1.2 and below."
}
}

Expand Down
Loading

0 comments on commit b6ff075

Please sign in to comment.