Skip to content

Commit

Permalink
added digitalnz ingestor
Browse files Browse the repository at this point in the history
  • Loading branch information
Conal-Tuohy committed Jul 18, 2017
1 parent 2548ca5 commit 3a63f95
Show file tree
Hide file tree
Showing 4 changed files with 286 additions and 0 deletions.
136 changes: 136 additions & 0 deletions xproc/ingest-digitalnz.xpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
<?xml version="1.0"?>
<!--
Copyright 2017 Conal Tuohy
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<p:declare-step
name="main"
version="1.0"
xmlns:p="http://www.w3.org/ns/xproc"
xmlns:fn="http://www.w3.org/2005/xpath-functions"
xmlns:c="http://www.w3.org/ns/xproc-step"
xmlns:oceania="https://github.com/Conal-Tuohy/oceania"
xmlns:sparql="tag:conaltuohy.com,2017:sparql"
xmlns:pxf="http://exproc.org/proposed/steps/file"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:cx="http://xmlcalabash.com/ns/extensions"
xmlns:sitemap="http://www.sitemaps.org/schemas/sitemap/0.9"
>
<!-- import calabash extension library to enable use of exproc steps -->
<p:import href="http://xmlcalabash.com/extension/steps/library-1.0.xpl"/>

<!-- the SPARQL Protocols -->
<p:import href="sparql.xpl"/>

<!-- Directory containing the XML files harvested from DigitalNZ; must be supplied by the caller. -->
<p:option name="directory" required="true"/>

<!--<sparql:update query="drop all" service-uri="http://oceania.digital:8080/fuseki/oceania/update"/>-->
<!-- Step 1: produce a sorted listing of the *.xml files found in $directory -->
<oceania:list-files name="list-of-xml-files-harvested-from-digitalnz">
  <p:with-option name="directory" select="$directory"/>
</oceania:list-files>
<!-- <p:filter name="selection-of-files-to-ingest" select="//c:file"/>--><!-- currently ingest all -->
<!-- Step 2: ingest every file named in that listing -->
<oceania:ingest>
  <p:input port="manifest">
    <p:pipe step="list-of-xml-files-harvested-from-digitalnz" port="result"/>
  </p:input>
</oceania:ingest>

<!--
  oceania:list-files
  Lists the files in a directory whose names match ".*\.xml" and passes the
  listing through ../xslt/sort-files.xsl.
    option "directory" : path of the directory to list (required)
    output "result"    : the sorted directory listing
-->
<p:declare-step type="oceania:list-files" name="list-files">
  <p:output port="result" sequence="true"/>
  <p:option name="directory" required="true"/>

  <!-- read the directory, keeping only the XML files -->
  <p:directory-list name="directory-listing" include-filter=".*\.xml">
    <p:with-option name="path" select="$directory"/>
  </p:directory-list>

  <!-- progress message; the listing passes through cx:message unchanged -->
  <cx:message name="sorting" message="Sorting data files ...">
    <p:input port="source">
      <p:pipe step="directory-listing" port="result"/>
    </p:input>
  </cx:message>

  <!-- reorder the entries of the listing -->
  <p:xslt name="sort-files">
    <p:input port="source">
      <p:pipe step="sorting" port="result"/>
    </p:input>
    <p:input port="stylesheet">
      <p:document href="../xslt/sort-files.xsl"/>
    </p:input>
    <p:input port="parameters">
      <p:empty/>
    </p:input>
  </p:xslt>
</p:declare-step>

<!--
  oceania:ingest
  Reads a directory listing, loads every c:file it names, and stores each
  /search/results/result record it contains as its own named graph in a
  SPARQL graph store.
    input  "manifest"    : directory listing containing c:file elements
    option "graph-store" : URI of the SPARQL Graph Store endpoint; defaults to
                           the local Fuseki instance
  Records whose upload does not yield a 2xx response are saved under
  ../../digitalnz-errors/ (one file per record id) for later inspection.
-->
<p:declare-step type="oceania:ingest" name="ingest">
  <p:input port="manifest"/>
  <p:option name="graph-store" select="'http://localhost:8080/fuseki/oceania/data'"/>
  <!-- process each metadata file containing a batch of records-->
  <cx:message name="ingesting" message="Ingesting data files ..."/>
  <p:for-each name="list-of-files">
    <p:iteration-source select="//c:file"/><!-- all the files in the directory listing -->
    <!-- make the base URI explicit so that the file's full location can be built below -->
    <p:add-xml-base/>
    <cx:message name="ingesting-file">
      <p:with-option name="message" select="concat('Ingesting ', /c:file/@name, ' ...')"/>
    </cx:message>
    <p:load name="batch-of-records">
      <p:with-option name="href" select="concat(/c:file/@xml:base, /c:file/@name)"/>
    </p:load>
    <!--
      Empty "-url" fields identify no resource, so drop them before encoding;
      otherwise they would reach the RDF transform as empty resource references.
    -->
    <p:delete name="discard-empty-url-fields"
      match="
        *
        [ends-with(local-name(.), '-url')]
        [not(*)]
        [not(normalize-space(.))]
      "/>
    <!-- Catch any "-url" fields which don't start with "http" and re-encode their text value as a data: URI -->
    <cx:message name="encoding-data-uris" message="Encoding text as data: URIs ..."/>
    <!--
      The match targets the text node, not the element: p:string-replace swaps the
      whole matched node for the string, so matching the element itself would
      remove the field and leave the data: URI as loose text in its parent.
    -->
    <p:string-replace name="convert-free-text-to-data-uri"
      match="
        *
        [ends-with(local-name(.), '-url')]
        [not(*)]
        [not(starts-with(., 'http'))]
        /text()
      "
      replace="
        concat(
          'data:text/plain;charset=utf-8,',
          encode-for-uri(normalize-space(.))
        )
      "
    />
    <cx:message name="sanitising-uris" message="Sanitising URIs ..."/>
    <!-- TODO replace step with an XSLT using xsl:analyze-string to tidy the URIs -->
    <!-- percent-encode backslashes ... -->
    <p:string-replace name="sanitize-invalid-uri-syntax"
      match="*[ends-with(local-name(.), '-url')]/text()"
      replace="replace(., '\\', '%5C')"/>
    <!-- ... and spaces, neither of which is legal in a URI -->
    <p:string-replace name="sanitize-invalid-uri-syntax-2"
      match="*[ends-with(local-name(.), '-url')]/text()"
      replace="replace(., ' ', '%20')"/>
    <p:for-each name="list-of-records">
      <p:iteration-source select="/search/results/result"/><!-- for debugging; select a particular input file [id='35800503']-->
      <!-- process the item record -->
      <p:variable name="id" select="/result/id"/>
      <!-- each record gets its own named graph, identified by a tag: URI built from the record id -->
      <p:variable name="graph-uri" select="concat('tag:oceania.digital,2017:', $id)"/>
      <cx:message name="log-attempt">
        <p:with-option name="message" select="concat('ingesting record ', $id, ' as ', $graph-uri, ' ...')"/>
      </cx:message>
      <p:xslt name="transform-digitalnz-result-to-rdf-graph">
        <p:input port="parameters"><p:empty/></p:input>
        <p:input port="stylesheet">
          <p:document href="../xslt/digitalnz-result-to-rdf.xsl"/>
        </p:input>
      </p:xslt>
      <sparql:store-graph>
        <p:with-option name="graph-store" select="$graph-store"/>
        <p:with-option name="graph-uri" select="$graph-uri"/>
      </sparql:store-graph>
      <cx:message name="log-result">
        <p:with-option name="message" select="concat('Result code: ', /c:response/@status)"/>
      </cx:message>
      <p:choose>
        <p:when test="starts-with(/c:response/@status, '2')">
          <!-- 👌 Any kind of 200-level response is good — the graph was created or updated successfully -->
          <p:sink/>
        </p:when>
        <p:otherwise>
          <!--
            😣 Some kind of error occurred — we have either:
            ‣ a c:response with an unexpected HTTP status code; or
            ‣ a c:error describing a network error
          -->
          <p:store indent="true">
            <p:with-option name="href" select="concat('../../digitalnz-errors/', $id, '.xml')"/>
          </p:store>
        </p:otherwise>
      </p:choose>
    </p:for-each>
  </p:for-each>
</p:declare-step>

</p:declare-step>
102 changes: 102 additions & 0 deletions xproc/sparql.xpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
<?xml version="1.0"?>
<!--
Copyright 2016 Conal Tuohy
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<p:library
version="1.0"
xmlns:p="http://www.w3.org/ns/xproc"
xmlns:fn="http://www.w3.org/2005/xpath-functions"
xmlns:c="http://www.w3.org/ns/xproc-step"
xmlns:sparql="tag:conaltuohy.com,2017:sparql"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:pxf="http://exproc.org/proposed/steps/file"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
>

<!--
  sparql:update
  Sends a SPARQL Update request to a SPARQL endpoint by HTTP POST and discards
  the response.
    option "service-uri" : URI of the SPARQL Update endpoint (required)
    option "query"       : text of the SPARQL Update request (required)
  Both options are used unconditionally to build the request, so they are
  declared required, like the options of the other steps in this library.
-->
<p:declare-step type="sparql:update" name="sparql-update">
  <p:option name="service-uri" required="true"/>
  <p:option name="query" required="true"/>
  <!-- build the c:request document describing the POST -->
  <p:template name="construct-update-request">
    <p:with-param name="service-uri" select="$service-uri"/>
    <p:with-param name="query" select="$query"/>
    <p:input port="source"><p:empty/></p:input>
    <p:input port="template">
      <p:inline>
        <c:request href="{$service-uri}" method="POST" detailed="true">
          <c:body content-type="application/sparql-update">{$query}</c:body>
        </c:request>
      </p:inline>
    </p:input>
  </p:template>
  <p:http-request name="sparql-update-http-post"/>
  <!-- the step declares no output port, so the HTTP response is thrown away -->
  <p:sink/>
</p:declare-step>

<!-- delete graph -->
<!--
  sparql:delete-graph
  Deletes a named graph from a SPARQL graph store by HTTP DELETE and discards
  the response.
    option "graph-store" : URI of the graph store endpoint (required)
    option "graph-uri"   : URI of the named graph to delete (required)
  The graph is identified with a URI-encoded "graph" query parameter, the same
  way sparql:store-graph addresses it, so a graph stored by that step can be
  deleted by passing the same two option values here.
-->
<p:declare-step type="sparql:delete-graph" name="delete-graph">
  <p:option name="graph-store" required="true"/>
  <p:option name="graph-uri" required="true"/>
  <!-- build the c:request document describing the DELETE -->
  <p:template name="construct-deletion-request">
    <p:with-param name="graph-store" select="$graph-store"/>
    <p:with-param name="graph-uri" select="$graph-uri"/>
    <p:input port="template">
      <p:inline>
        <c:request method="DELETE" href="{$graph-store}?graph={encode-for-uri($graph-uri)}" detailed="true"/>
      </p:inline>
    </p:input>
    <p:input port="source">
      <p:empty/>
    </p:input>
  </p:template>
  <p:http-request/>
  <!-- the step declares no output port, so the HTTP response is thrown away -->
  <p:sink/>
</p:declare-step>

<!-- store graph -->
<!--
  sparql:store-graph
  Stores an RDF/XML document as a named graph in a SPARQL graph store, using
  HTTP PUT.
    input  "source"      : the RDF/XML document to store
    output "result"      : the detailed HTTP response, or the error document
                           caught if the request step failed
    option "graph-store" : URI of the graph store endpoint (required)
    option "graph-uri"   : URI of the named graph to write (required)
-->
<p:declare-step type="sparql:store-graph" name="store-graph">
  <p:input port="source"/>
  <p:output port="result"/>
  <p:option name="graph-store" required="true"/>
  <p:option name="graph-uri" required="true"/>

  <!-- wrap the incoming graph in a c:request that PUTs it to the graph's location in the store -->
  <p:template name="build-put-request">
    <p:input port="template">
      <p:inline>
        <c:request method="PUT" href="{$graph-store}?graph={encode-for-uri($graph-uri)}" detailed="true">
          <c:body content-type="application/rdf+xml">{ /* }</c:body>
        </c:request>
      </p:inline>
    </p:input>
    <p:input port="source">
      <p:pipe step="store-graph" port="source"/>
    </p:input>
    <p:with-param name="graph-store" select="$graph-store"/>
    <p:with-param name="graph-uri" select="$graph-uri"/>
  </p:template>

  <!-- send the request; if the step fails, emit the error document in place of a response -->
  <p:try name="send-put-request">
    <p:group>
      <p:http-request name="http-put"/>
    </p:group>
    <p:catch name="request-failed">
      <p:identity name="report-error">
        <p:input port="source">
          <p:pipe step="request-failed" port="error"/>
        </p:input>
      </p:identity>
    </p:catch>
  </p:try>
</p:declare-step>

</p:library>
34 changes: 34 additions & 0 deletions xslt/digitalnz-result-to-rdf.xsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="tag:oceania.digital,2017:digitalnz#">
<!--
  Converts one DigitalNZ result record into an RDF/XML description.
  The subject is "item-{id}#", resolved against the xml:base below. Every leaf
  element of the record that has content becomes a property named after that
  element, in this stylesheet's default namespace:
    - elements whose name ends in "-url" become resource references;
    - all others become literals, typed where the source declares a type.
  Leaf elements with no content are skipped. They would otherwise yield empty
  literals, ill-typed values such as an empty integer, or an empty rdf:resource
  that resolves to the base URI itself.
-->
<xsl:template match="/result">
  <rdf:RDF xml:base="http://oceania.digital/digitalnz/">
    <rdf:Description rdf:about="item-{id}#">
      <xsl:variable name="leaf-node-elements" select=".//*[not(*)][normalize-space()]"/>
      <xsl:for-each select="$leaf-node-elements">
        <xsl:element name="{local-name()}">
          <xsl:choose>
            <xsl:when test="ends-with(local-name(.), '-url')">
              <xsl:attribute name="rdf:resource" select="."/>
            </xsl:when>
            <xsl:otherwise>
              <!-- the datatype applies to literals only: RDF/XML does not allow rdf:datatype together with rdf:resource -->
              <xsl:apply-templates select="." mode="type"/>
              <xsl:value-of select="."/>
            </xsl:otherwise>
          </xsl:choose>
        </xsl:element>
      </xsl:for-each>
    </rdf:Description>
  </rdf:RDF>
</xsl:template>

<!--
  Mode "type": emits an rdf:datatype attribute for a source element, chosen
  from the element's own "type" attribute.
-->

<!-- type="integer" is given the XML Schema "int" datatype -->
<xsl:template match="*[@type='integer']" mode="type">
  <xsl:attribute name="rdf:datatype" select="'http://www.w3.org/2001/XMLSchema#int'"/>
</xsl:template>

<!-- type="dateTime" is given the XML Schema "dateTime" datatype -->
<xsl:template match="*[@type='dateTime']" mode="type">
  <xsl:attribute name="rdf:datatype" select="'http://www.w3.org/2001/XMLSchema#dateTime'"/>
</xsl:template>

<!-- any other element gets no datatype attribute -->
<xsl:template match="*" mode="type"/>

</xsl:stylesheet>

14 changes: 14 additions & 0 deletions xslt/sort-files.xsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0"
xmlns:c="http://www.w3.org/ns/xproc-step"
>
<!--
  Copies every element together with its attributes, and writes out its child
  nodes ordered by the part of their "name" attribute that precedes the first
  ".", compared as a number.
-->
<xsl:template match="*">
  <xsl:copy>
    <!-- attributes are carried over unchanged -->
    <xsl:copy-of select="@*"/>
    <!-- children are processed recursively, in numeric order of their name prefix -->
    <xsl:apply-templates select="node()">
      <xsl:sort data-type="number" select="substring-before(@name, '.')"/>
    </xsl:apply-templates>
  </xsl:copy>
</xsl:template>

</xsl:stylesheet>

0 comments on commit 3a63f95

Please sign in to comment.