Skip to content
This repository was archived by the owner on Feb 13, 2023. It is now read-only.

Commit a9b6938

Browse files
authored
Merge pull request #1 from OpenBookPublishers/develop
Develop
2 parents f30855e + 5ca8631 commit a9b6938

File tree

9 files changed

+152
-15
lines changed

9 files changed

+152
-15
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,4 @@
11
*~
2+
*.xml
3+
*.zip
4+
output/

.gitmodules

Lines changed: 0 additions & 3 deletions
This file was deleted.

Dockerfile

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
FROM python:3.8.0-slim-buster
2+
3+
WORKDIR /ebook_automation
4+
5+
# https://github.com/geerlingguy/ansible-role-java/issues/64#issuecomment-393299088
6+
RUN mkdir -p /usr/share/man/man1
7+
RUN apt-get update && \
8+
apt-get install -y openjdk-11-jdk libsaxonb-java zip
9+
RUN rm -rf /var/cache/apt/*
10+
11+
COPY run ./
12+
COPY Extract-citations-from-book.xsl ./
13+
14+
ENV OUTDIR=/ebook_automation/output
15+
16+
CMD bash run file

Extract-citations

Lines changed: 0 additions & 1 deletion
This file was deleted.

Extract-citations-from-book.xsl

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0"
3+
xmlns="http://www.crossref.org/doi_resources_schema/4.4.1"
4+
xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:doi="http://www.crossref.org/schema/4.4.1">
5+
<xsl:output method="xml" indent="yes"/>
6+
<xsl:strip-space elements="*"/>
7+
<xsl:variable name="OBP-base-doi"
8+
select="doc('doi-deposit.xml')/descendant::doi:book_metadata/descendant::doi:doi_data/doi:doi"/>
9+
<xsl:variable name="doi-batch-id" select="doc('doi-deposit.xml')/descendant::doi:doi_batch_id"/>
10+
11+
<xsl:template match="/">
12+
<xsl:result-document href="DOI-citations/{$doi-batch-id}-citations.xml" method="xml">
13+
<doi_batch version="4.1.1" xmlns="http://www.crossref.org/doi_resources_schema/4.1.1"
14+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
15+
xsi:schemaLocation="http://www.crossref.org/doi_resources_schema/4.1.1 http://www.crossref.org/schema/deposit/doi_resources4.1.1.xsd">
16+
<head>
17+
<doi_batch_id>
18+
<xsl:value-of select="$doi-batch-id"/>
19+
<xsl:text>-citations</xsl:text>
20+
</doi_batch_id>
21+
<depositor>
22+
<depositor_name>Open Book Publishers</depositor_name>
23+
<email_address>distribution@openbookpublishers.com</email_address>
24+
</depositor>
25+
</head>
26+
<body>
27+
<doi_citations>
28+
<!-- DOI of the book that contains the citations (extracted from the metadata deposit) -->
29+
<doi>
30+
<xsl:value-of select="$OBP-base-doi"/>
31+
</doi>
32+
<citation_list>
33+
<xsl:for-each select="//tei:bibl">
34+
<xsl:if test="not(parent::tei:figure)">
35+
<!-- Numbering citations from 1 to n -->
36+
<xsl:variable name="number">
37+
<xsl:number level="any"/>
38+
</xsl:variable>
39+
<citation key="ref{$number}">
40+
<xsl:choose>
41+
<!-- When the DOI of the work being cited is known, that is enough info -->
42+
<xsl:when test="./tei:ref/text()[contains(., 'doi.org')]">
43+
<xsl:if test="./tei:ref[1]/text()[contains(., 'doi.org')]">
44+
<xsl:variable name="doi-suffix" select="substring-after(./tei:ref[1]/text(), 'org/')"/>
45+
<doi>
46+
<xsl:value-of select="$doi-suffix"/>
47+
</doi>
48+
</xsl:if>
49+
<xsl:if test="./tei:ref[2]/text()[contains(., 'doi.org')]">
50+
<xsl:variable name="doi-suffix" select="substring-after(./tei:ref[2]/text(), 'org/')"/>
51+
<doi>
52+
<xsl:value-of select="$doi-suffix"/>
53+
</doi>
54+
</xsl:if>
55+
</xsl:when>
56+
<!-- When the DOI of the work being cited is NOT known, we deposit an unstructured citation -->
57+
<xsl:otherwise>
58+
<unstructured_citation>
59+
<xsl:apply-templates/>
60+
<xsl:call-template name="other-ref"/>
61+
</unstructured_citation>
62+
</xsl:otherwise>
63+
</xsl:choose>
64+
</citation>
65+
</xsl:if>
66+
</xsl:for-each>
67+
</citation_list>
68+
</doi_citations>
69+
</body>
70+
</doi_batch>
71+
</xsl:result-document>
72+
</xsl:template>
73+
74+
<!-- Ignore any URL that is not a DOI (as recommended by CrossRef team) -->
75+
<xsl:template name="other-ref" match="//tei:bibl/tei:ref"/>
76+
77+
<!-- Do not include italics (as recommended by CrossRef team)
78+
<xsl:template match="//tei:bibl/tei:hi[contains(@rendition, 'italic')]">
79+
<i>
80+
<xsl:apply-templates/>
81+
</i>
82+
</xsl:template>-->
83+
84+
</xsl:stylesheet>

LICENSE.txt renamed to LICENSE

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -631,8 +631,8 @@ to attach them to the start of each source file to most effectively
631631
state the exclusion of warranty; and each file should have at least
632632
the "copyright" line and a pointer to where the full notice is found.
633633

634-
<one line to give the program's name and a brief idea of what it does.>
635-
Copyright (C) <year> <name of author>
634+
{one line to give the program's name and a brief idea of what it does.}
635+
Copyright (C) {year} {name of author}
636636

637637
This program is free software: you can redistribute it and/or modify
638638
it under the terms of the GNU General Public License as published by
@@ -652,7 +652,7 @@ Also add information on how to contact you by electronic and paper mail.
652652
If the program does terminal interaction, make it output a short
653653
notice like this when it starts in an interactive mode:
654654

655-
<program> Copyright (C) <year> <name of author>
655+
{project} Copyright (C) {year} {fullname}
656656
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657657
This is free software, and you are welcome to redistribute it
658658
under certain conditions; type `show c' for details.

README.md

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,20 @@
22
Wrapper to extract citations from XML editions of OBP books.
33

44
## How to run this tool
5-
### Setup
5+
### Run with docker
6+
```
7+
docker run --rm \
8+
-v /path/to/local/file.xml.zip:/ebook_automation/file.xml.zip \
9+
-v /path/to/local/doi_deposit.xml:/ebook_automation/file.xml \
10+
-v /path/to/output:/ebook_automation/output \
11+
openbookpublishers/obp-extract-cit
12+
```
13+
14+
Alternatively you may clone the repo, build the image using `docker build . -t some/tag` and run the command above replacing `openbookpublishers/obp-extract-cit` with `some/tag`.
15+
16+
17+
### Run locally
18+
#### Setup
619
This wrapper requires `saxonb-xslt` to be installed on your system. On Debian (or Debian-based distributions) this package can be installed via
720

821
`apt-get install libsaxonb-java`
@@ -13,15 +26,41 @@ To perform the setup, run:
1326

1427
The setup contains the necessary instruction to initialise the submodule.
1528

16-
### Run
29+
#### Run
1730
To run the process, place a copy of the **XML edition of the book** and the **DOI deposit** in the _obp-extract-cit_ folder. Finally, run:
1831

1932
`bash run prefix`
2033

2134
where _prefix_ is the name shared by the book and the DOI deposit files; e.g. `bash run Siklos-Advanced_Problems2`.
2235

23-
### Clean-up
36+
#### Clean-up
2437

2538
`bash clean [-y]`
2639

27-
would remove temporary files (untracked files and folders stored in the _obp-extract-cit_ folder). The script asks for the user's confirmation before removing the files, but if you are running this as part of a script you might want to use the`-y` flag to bypass the confirmation.
40+
would remove temporary files (untracked files and folders stored in the _obp-extract-cit_ folder). The script asks for the user's confirmation before removing the files, but if you are running this as part of a script you might want to use the `-y` flag to bypass the confirmation.
41+
42+
## Extract-citations
43+
44+
This repository contains a simple tool that extracts bibliographic citations from content encoded in XML TEI and creates a file for submission to CrossRef's cited-by service (see the repo's [wiki](https://github.com/OpenBookPublishers/Extract-citations/wiki)).
45+
46+
## Files and directories in this repository
47+
* __Extract-citations-from-book.xsl__: the script that extracts bibliographic citations
48+
* __LICENSE__
49+
* __README.md__: this file
50+
51+
52+
### Extracting citations
53+
This XSL transformation has been developed in conjunction with the conversion tools hosted at https://github.com/OpenBookPublishers/XML-last but can be used on any XML TEI file where bibliographic citations have been encoded as `<bibl>` elements (see http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-bibl.html).
54+
This program:
55+
* identifies every `<bibl>` element within the input file
56+
* extracts and numbers them sequentially
57+
* converts each of them to a `<citation>` element containing either a `<doi>` (when the cited work's DOI is known) or an `<unstructured_citation>` (see the repo's [wiki](https://github.com/OpenBookPublishers/extract-citations/wiki) to read more about the structure of the output file).
58+
59+
To run it:
60+
1. Copy your input files to the project folder:
61+
* the XML TEI file containing the book or article you wish to extract citations from
62+
* 'doi-deposit.xml', a file that records the book or article metadata according to the CrossRef schema, version 4.3.5 or newer ( https://support.crossref.org/hc/en-us/articles/214530063). This is the same file that is often used to register content to the CrossRef database (see https://support.crossref.org/hc/en-us/articles/215577783-Creating-content-registration-XML)
63+
2. Run 'Extract-citations-from-book.xsl'. To run this transformation (XSLT 2.0) a processor such as SaxonHE will be needed (https://sourceforge.net/projects/saxon/files/Saxon-HE/9.8/). Saxon can be run (1) from within a product that provides a graphical user interface (such as oXygen, https://www.oxygenxml.com/), (2) from the command line or (3) from within a Java or .NET application.
64+
* (1) select your input file and the XSL; the output field can be left blank
65+
* (2) type `java -jar _dir_/saxon9he.jar -s:_your_dir_/Extract-citations/_your_input_file_ -xsl:_your_dir_/Extract-citations/Extract-citations-from-book.xsl -o:_your_dir_/Extract-citations/_your_output_file_` (do not point `-o:` at the stylesheet itself; the `DOI-citations` folder is written relative to the output location)
66+
* (3) see eg http://www.oracle.com/technetwork/java/gazfm-138953.html

run

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ book_name=$1
77
cd $(dirname $0)
88

99
work_dir=$(mktemp -d -t obp-extract-cit-XXXXXX)
10+
out_dir="${OUTDIR:-./}"
1011

1112
cleanup () {
1213
local rv=$?
@@ -19,7 +20,7 @@ trap cleanup EXIT
1920
echo "Copy relevant files to work directory"
2021
unzip -q ${book_name}.xml.zip -d $work_dir
2122
cp ${book_name}.xml $work_dir/doi-deposit.xml
22-
cp Extract-citations/Extract-citations-from-book.xsl $work_dir
23+
cp Extract-citations-from-book.xsl $work_dir
2324

2425
echo "Execute the stylesheet to extract citations"
2526

@@ -28,4 +29,4 @@ echo "Execute the stylesheet to extract citations"
2829
-xsl:Extract-citations-from-book.xsl -ext:on; \
2930
echo "Copy citations back")
3031

31-
cp -r $work_dir/DOI-citations/. .
32+
cp -r $work_dir/DOI-citations/. $out_dir

setup

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,3 @@ check () {
1111
}
1212

1313
check "saxonb-xslt"
14-
15-
git submodule update --init

0 commit comments

Comments
 (0)