-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #117 from rhubner/url-decode
Url decode
- Loading branch information
Showing
17 changed files
with
275 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
28 changes: 28 additions & 0 deletions
28
...idator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_2/Schema.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
/** | ||
* Copyright (c) 2013, The National Archives <digitalpreservation@nationalarchives.gov.uk> | ||
* http://www.nationalarchives.gov.uk | ||
* | ||
* This Source Code Form is subject to the terms of the Mozilla Public | ||
* License, v. 2.0. If a copy of the MPL was not distributed with this | ||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. | ||
*/ | ||
package uk.gov.nationalarchives.csv.validator.schema.v1_2 | ||
|
||
import uk.gov.nationalarchives.csv.validator.metadata.Row | ||
import uk.gov.nationalarchives.csv.validator.schema.{Schema, ArgProvider} | ||
import java.net.{URLDecoder => JURLDecoder} | ||
|
||
case class UriDecode(value: ArgProvider, charset: Option[ArgProvider]) extends ArgProvider { | ||
|
||
val DefaultCharset = "UTF-8" | ||
|
||
override def referenceValue(columnIndex: Int, row: Row, schema: Schema): Option[String] = value.referenceValue(columnIndex, row, schema).map( value => { | ||
|
||
val codepage = charset.flatMap(x => x.referenceValue(columnIndex, row, schema)).getOrElse(DefaultCharset) | ||
|
||
JURLDecoder.decode(value,codepage) | ||
|
||
}) | ||
|
||
override def toError: String = "uriDecode(" + value.toError + ")" | ||
} |
28 changes: 28 additions & 0 deletions
28
...-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_2/SchemaParser.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
/** | ||
* Copyright (c) 2013, The National Archives <digitalpreservation@nationalarchives.gov.uk> | ||
* http://www.nationalarchives.gov.uk | ||
* | ||
* This Source Code Form is subject to the terms of the Mozilla Public | ||
* License, v. 2.0. If a copy of the MPL was not distributed with this | ||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. | ||
*/ | ||
package uk.gov.nationalarchives.csv.validator.schema.v1_2 | ||
|
||
import scala.language.reflectiveCalls | ||
import uk.gov.nationalarchives.csv.validator.schema.{Literal, ArgProvider} | ||
import uk.gov.nationalarchives.csv.validator.schema.v1_1.{SchemaParser => SchemaParser1_1} | ||
|
||
trait SchemaParser extends SchemaParser1_1 { | ||
|
||
/** | ||
* [59] StringProvider ::= ColumnRef | StringLiteral | ||
*/ | ||
override lazy val stringProvider: PackratParser[ArgProvider] = "StringProvider" ::= noext | concat | urlDecode | columnRef | stringLiteral ^^ { | ||
s => Literal(Some(s)) | ||
} | ||
|
||
lazy val urlDecode: PackratParser[ArgProvider] = "UriDecode" ::= "uriDecode(" ~> stringProvider ~ opt("," ~> stringProvider) <~ ")" ^^ { | ||
case value ~ charset => UriDecode(value, charset) | ||
} | ||
|
||
} |
4 changes: 4 additions & 0 deletions
4
...r-core/src/test/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecode.csvs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
version 1.2 | ||
@totalColumns 2 @noHeader | ||
identifier: | ||
filename: in(uriDecode($identifier)) |
1 change: 1 addition & 0 deletions
1
...ore/src/test/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecodeFail.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
file:/some/folder/some%21file.txt,some file.txt |
1 change: 1 addition & 0 deletions
1
...ore/src/test/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecodePass.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
file:/some/folder/some%20file.txt,some file.txt |
5 changes: 5 additions & 0 deletions
5
...test/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecodeWithCharset.csvs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
version 1.2 | ||
@totalColumns 3 @noHeader | ||
identifier: | ||
filename: in(uriDecode($identifier, $charset)) | ||
charset: |
2 changes: 2 additions & 0 deletions
2
...t/resources/uk/gov/nationalarchives/csv/validator/acceptance/uriDecodeWithCharsetPass.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
file:/some/folder/some%20file.txt,some file.txt,UTF-8 | ||
file:/some/folder/text%9Atext.txt,textštext.txt,windows-1252 |
120 changes: 120 additions & 0 deletions
120
...nalarchives/csv/validator/integrityCheck/WO_95/tech_acq_metadata_v1_WO95Y14B000_v1.2.csvs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
version 1.2 | ||
@totalColumns 33 | ||
/*--------------------------------------------------------------------------------------------------------------- | ||
|This schema is for the validation of technical acquisition metadata | | ||
|csv files according to the specification given for digitised surrogates in | | ||
|http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf | | ||
|This version is for WO 95 digitisation in the period 2014-15 | | ||
| 20140818 Version 1.0 DHU First release version for this project | | ||
| 20140910 Version 1.1 DHU Updated date regex to fix issues, allowed items up to 14, disallow fullstops | | ||
|at end of description as this causes search issues in Discovery. | | ||
| 20141016 version 1.2 NW Updated regex to allow 20 items, 500 ordinals & addition of legal_status | | ||
|and held_by fields, changed date column to covering_date | | ||
| 20141110 version 1.3 NW fixed sub_sub_series rule | | ||
|from sub_sub_series: range(1,7) or is("115") or if($piece/is("5500"),is("")) | | ||
|to sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115"))) | | ||
| 20160511 - RH - Update schema version to test CSV validator backward compatibility | | ||
---------------------------------------------------------------------------------------------------------------*/ | ||
/*The header of the schema file, ie the statements version 1.0 and @totalColumns 31, indicates that this schema | ||
is using version 1.0 of the schema language (NB, not that that it is version 1.0 of this particular schema), | ||
and that there are 31 columns in total in the file.*/ | ||
batch_code: length(1,11) regex("^WO95Y14B([0-9]{3}|smp)$") | ||
//1st part, batch_code must be between 1 and 11 characters long, and (implicitly multiple conditions are joined | ||
//by a logical AND unless another boolean is provided). 2nd part restricts to form similar to WO95Y14B000 (last | ||
//three digits are running number for batches throughout the project. | ||
department: is("WO") and (in($file_path) and in($resource_uri)) | ||
//Parentheses control evaluation order of booleans as might be expected | ||
//Department is fixed value of WO for this project. | ||
//The grouped "in" statements say that the value found in this field must also be found as part of the fields | ||
//"file_path" and "resource_uri" | ||
division: is("13") | ||
//this field must be precisely 13 | ||
series: is("95") and (in($file_path) and in($resource_uri)) | ||
//Fixed value of 95 for this project | ||
//The value must also be part of the fields "file_path" and "resource_uri" | ||
sub_series: is("1") | ||
//For the 2014-15 project all material to be digitised is in sub_series 1 (France and Flanders) | ||
sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115"))) | ||
//As described in Appendix E of the ITT, the 1914-15 project is scanning material in sub_sub_series 1-7 and 115, | ||
//Piece 5500 is also included which is not in any sub_sub_series, so the value is blank for that piece only. | ||
piece: if($sub_sub_series/is("1"),range(1,85),if($sub_sub_series/is("2"),range(86,153),if($sub_sub_series/is("3"),range(154,267),if($sub_sub_series/is("4"),range(268,358),if($sub_sub_series/is("5"),range(359,430),if($sub_sub_series/is("6"),range(431,517),if($sub_sub_series/is("7"),range(518,571),if($sub_sub_series/is("115"),range(3949,4193),if($sub_sub_series/is(""),is("5500")))))))))) and (in($file_path) and in($resource_uri)) | ||
//For this project there is a defined relationship between piece ranges as listed in Appendix E | ||
//This is encapsulated in this rather complex if,then,else statement | ||
//The value must also be part of the fields "file_path" and "resource_uri" | ||
item: (range(1,20) and in($file_path)) or is("") | ||
//Most pieces are subdivided into items, there are not expected to be more than 10 per piece | ||
//The value must also be part of the fields "file_path" and "resource_uri" | ||
//In many cases the item level is not used, so this would be left blank. | ||
//as the sorting/cataloguing process advances this condition may be tightened | ||
ordinal: range(1,500) and in($file_path) unique($department,$division,$series,$sub_series,$sub_sub_series,$piece,$item,$ordinal) | ||
//the ordinal is a simple running count of the images within an item (or piece if not itemised). | ||
//No single item (or piece if not itemised) should contain more than 150 pages but rule changed to 500 to allow for exceptions | ||
//This (with leading zeroes) also forms the final part of the filepath, immediately before the .jp2 extension | ||
//the combination of fields indicated should be unique within the file | ||
description: not("") and regex("^.*[^\.]$") | ||
//description is a fairly free-form field, but must not be empty | ||
covering_date: regex("^19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?(-19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?)?$") | ||
//dates according to The National Archives' cataloguing standards, expected to be a range for this project, but may be relaxed | ||
legal_status: is("Public Record") | ||
held_by: is("The National Archives, Kew") | ||
file_uuid: uuid4 unique | ||
//must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case. | ||
file_path: uri starts("file:///WO_95/") unique fileExists integrityCheck("excludeFolder") | ||
//fileExists checks that there is actually a file of the given name at the specified location on the file system. | ||
//In practice, the validator will normally be run with the --path switch | ||
//(see http://digital-preservation.github.io/csv-validator/) | ||
//We also require that the path is a valid uri, and begins file:///WO_95/ as this is the top-level folder for each batch | ||
//(Conditions specified on earlier columns say that the values of those columns must also appear as part of the | ||
//content of this field) | ||
//must be unique within the file | ||
file_checksum: unique checksum(file($file_path),"SHA-256") | ||
//Compare the value given in this field to the checksum calculated for the file found at the location given in | ||
//the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself). | ||
//Use the specified checksum algorithm (must use lowercase hex characters). | ||
//unique within the file - an identical checksum would imply identical images | ||
resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/WO/95/") unique | ||
//Must be a valid uri which starts with the specified string, the uri is constructed such that it must be unique in the file | ||
//(Conditions specified on earlier columns say that the values of those columns must also appear as part of the | ||
//content of this field) | ||
scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") | ||
//12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is | ||
//restricted to the scanning company to avoid personally identifying data being held in the file | ||
scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") | ||
//Like "scan_operator", but this code represents the actual scanner or camera used | ||
scan_location: regex("[-\w\s,.]+") | ||
//Address or other description of the location where scanning physically occurred. The regex allows any number | ||
//of characters, allows general word and whitespace characters plus hyphen, comma and full stop | ||
image_resolution: positiveInteger (is("300") or is("600")) | ||
//Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used. | ||
//Depending how this is populated (whether nominal or actual resolution), it might be better to use a range | ||
//eg range(298,302) to capture slight variances in resolution. | ||
image_width: positiveInteger | ||
//Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this | ||
image_height: positiveInteger | ||
//Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this | ||
image_tonal_resolution: is("24-bit colour") | ||
//must be string: 24-bit colour (precisely - case as shown). Occasionally a different value might be specified. | ||
image_format: is("x-fmt/392") | ||
//must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM | ||
//(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392) | ||
image_compression: positiveInteger is("6") | ||
//Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm | ||
//available in the JPEG2000 specification | ||
image_colour_space: is("sRGB") | ||
//must be string: sRGB (precisely - case as shown) | ||
image_split: is("yes") or is("no") or is("composite") | ||
//must be string: yes; or string: no or string: composite (precisely - case as shown). Used if eg an image of complete double page | ||
//subsequently split into two separate images of each page individually, or if an oversize document is imaged as a composite of several images | ||
image_split_ordinal: if($image_split/is("composite"),range(1,9),is("")) | ||
//describes the ordering of the individual "tiles" when an oversize documents has to be imaged in sections as a composite. | ||
//9 is expected to be sufficient, but will be reviewed if required | ||
//if image_split is not composite it must be blank | ||
image_split_other_uuid: if($image_split/is("no"),is(""),regex("^[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}(,[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}){0,8}$")) | ||
//if "image_split" field is no, must be blank | ||
//else it must be a uuid4 or comma separated list of up to 9 uuid4s | ||
//due to the requirement to allow a comma separated list regex has had to be used, rather than the built in uuid4 datatype | ||
image_crop: is("auto") or is("manual") or is("none") | ||
//must be string: auto; or string: manual or string: none (precisely - case as shown) | ||
image_deskew: is("yes") or is("no") | ||
//must be string: yes; or string: no (precisely - case as shown) | ||
comments: regex("[\w\s,.]+") @optional |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.