-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarc2rdf.pl
177 lines (144 loc) · 5.05 KB
/
marc2rdf.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/perl -w
# marc2rdf.pl - transform MARC records to RDF, based on a mapping file in YAML
use MARC::File::USMARC;
use MARC::Record;
use MARC::Field;
use RDF::Redland;
use YAML::Syck;
use feature "switch";
use strict;
# DEBUG
use Data::Dumper;
# Input files: the MARC-to-RDF mapping, the general configuration and the
# MARC records to convert.
# my $yaml = 'mapping-normarc2rdf.yml';
my $yaml = 'normarc-mapping.yml';
my $conf = 'config.yml';
my $marc = 'koha.mrc';

# Load the MARC to RDF mapping and the configuration
my $maptags = LoadFile($yaml);
my $config  = LoadFile($conf);

# Set up some Redland stuff: an in-memory hash storage with a model on top.
# $model is also used by _create_triple() further down in this file.
my $storage = RDF::Redland::Storage->new("hashes", "test", "new='yes',hash-type='memory'");
die "Failed to create RDF::Redland::Storage\n" unless $storage;
my $model = RDF::Redland::Model->new($storage, "");
die "Failed to create RDF::Redland::Model for storage\n" unless $model;

# The serializer name comes from the first command line argument.
# Possible formats for the serializer: rdfxml, ntriples, turtle (more?)
my $serializer = RDF::Redland::Serializer->new($ARGV[0]);
die "Failed to find serializer\n" if !$serializer;

# Create nodes for the rdf:type predicate and the configured resource type
my $p_type = RDF::Redland::URINode->new('http://www.w3.org/1999/02/22-rdf-syntax-ns#type');
my $o_type = RDF::Redland::URINode->new($config->{'uri'}->{'resource_type'});

# Get the MARC records; fail early with a clear message if the file
# could not be opened, in line with the other set-up checks above.
my $batch = MARC::File::USMARC->in($marc);
die "Failed to open MARC file $marc\n" unless $batch;

my $count = 0;

# Iterate through our MARC records and convert them
RECORD: while (my $record = $batch->next()) {

    # DEBUG print "\n";

    # The subject URI is built from 999$c. Without it every such record
    # would end up with the same subject, so skip the record instead.
    my $record_id = $record->subfield('999', "c");
    if (!defined $record_id || $record_id eq '') {
        warn "Skipping record without 999\$c\n";
        next RECORD;
    }

    # Construct the subject URI
    my $s = RDF::Redland::URINode->new(
        $config->{'uri'}->{'base'} .
        $config->{'uri'}->{'resource_path'} .
        $config->{'uri'}->{'resource_prefix'} .
        $record_id
    );

    # Set the rdf:type
    my $statement = RDF::Redland::Statement->new($s, $p_type, $o_type);
    $model->add_statement($statement);

    # Iterate through all the fields in the record
    foreach my $field ($record->fields()) {

        my $tag      = $field->tag();
        my $fieldmap = $maptags->{'tag'}->{$tag};

        # Skip this field if there is no mapping for it
        next if !$fieldmap;

        if ($field->is_control_field()) {

            # Control fields carry a single data string and no subfields
            # DEBUG print $tag , " ", $fieldmap->{'predicate'}, "\n";
            _create_triple($s, $field->data(), $fieldmap);

        } else {

            # Iterate through the subfields; each one is a [code, value] pair
            foreach my $subfield ($field->subfields()) {

                my ($subfieldindicator, $subfieldvalue) = @{$subfield};
                my $subfieldmap = $fieldmap->{'subfield'}->{$subfieldindicator};

                # Skip this subfield if there is no mapping for it
                next if !$subfieldmap;

                # DEBUG print $tag, " ", $subfieldindicator, " ", $subfieldmap->{'predicate'}, "\n";
                _create_triple($s, $subfieldvalue, $subfieldmap);
            }
        }
    }

    $count++;
}

# DEBUG print "$count records done\n\n";

# Serialize the model into the format we set initially
# NOTE(review): the base URI is built as a URINode, as before; the serializer
# may expect an RDF::Redland::URI object instead - confirm against the
# Redland Perl API before changing.
my $base_uri = RDF::Redland::URINode->new("http://example.org/");
print $serializer->serialize_model_to_string($base_uri, $model);
# Build one triple for the given subject from a MARC value and add it to
# the file-level $model.
#
# Arguments:
#   $s    - subject node of the triple
#   $data - raw string taken from a control field or a subfield
#   $map  - mapping entry (hash reference). 'predicate' holds the predicate
#           URI; the optional 'object' hash may contain:
#             massage       - isbn | issn | leading_integer |
#                             remove_trailing_punctuation
#             substr_offset - start position of the part of $data to keep
#             substr_length - length of the part of $data to keep
#             prefix        - string to put in front of the value
#             datatype      - "uri" turns the value into a URI node,
#                             anything else gives a literal
#
# Returns nothing. No triple is added when the value cannot be normalised
# (invalid ISBN/ISSN, no leading digits, substring outside the string).
sub _create_triple {

    my ($s, $data, $map) = @_;

    # Entries without an 'object' section are treated as plain literals
    my $object_map = $map->{'object'} || {};

    # Construct the predicate URI
    my $p = RDF::Redland::URINode->new($map->{'predicate'});
    # DEBUG print "\tp: ", $p->as_string(), "\n";

    # Construct the object

    # Massage data
    my $massage = $object_map->{'massage'} // '';
    if ($massage eq 'isbn') {
        $data = _isbn($data);
    } elsif ($massage eq 'issn') {
        $data = _issn($data);
    } elsif ($massage eq 'leading_integer') {
        # Take the whole run of leading digits, and never fall back on a
        # capture left over from an earlier, unrelated match.
        $data = $data =~ m/\A(\d+)/ ? $1 : undef;
    } elsif ($massage eq 'remove_trailing_punctuation') {
        $data =~ s/[\.:,;\/\s]\s*$//;
    }

    # The massage step signals "no usable value" with undef
    return unless defined $data;

    # Get data based on a substring. The offset is tested with defined()
    # because 0 (start of the string) is a perfectly valid offset.
    my $substr_offset = $object_map->{'substr_offset'};
    my $substr_length = $object_map->{'substr_length'};
    if (defined $substr_offset && $substr_length) {
        # print "substr: $substr_offset $substr_length\n";
        $data = substr $data, $substr_offset, $substr_length;

        # substr() gives undef when the requested part lies outside $data
        return unless defined $data;
    }

    # Prepend the prefix, if there is one
    if ($object_map->{'prefix'}) {
        $data = $object_map->{'prefix'} . $data;
    }

    # Turn the data into a URI if the datatype is uri
    my $datatype = $object_map->{'datatype'} // '';
    if ($datatype eq 'uri') {
        $data = RDF::Redland::URINode->new($data);
    }
    my $o = RDF::Redland::Node->new($data);
    # DEBUG print "\to: ", $o->as_string(), "\n";

    # Construct the triple
    my $statement = RDF::Redland::Statement->new($s, $p, $o);
    $model->add_statement($statement);

    return;
}
# Normalise a raw ISBN string with Business::ISBN.
#
# Argument: the raw string as found in the MARC data.
# Returns the string given by isbn() on the ISBN-13 form of the number, or
# undef when no ISBN object can be created or the ISBN is not valid.
sub _isbn {

    use Business::ISBN;

    my ($raw) = @_;

    # Parsing through Business::ISBN removes any cruft in the data
    my $parsed = Business::ISBN->new( $raw );
    return undef unless $parsed;
    return undef unless $parsed->is_valid();

    # Make sure it's isbn13
    return $parsed->as_isbn13()->isbn();
}
# Normalise a raw ISSN string with Business::ISSN.
#
# Argument: the raw string as found in the MARC data.
# Returns the result of as_string() for a valid ISSN, or undef when no ISSN
# object can be created or the ISSN is not valid.
sub _issn {

    use Business::ISSN;

    my ($raw) = @_;

    # Parsing through Business::ISSN removes any cruft in the data
    my $parsed = Business::ISSN->new( $raw );
    return undef unless $parsed;
    return undef unless $parsed->is_valid();

    return $parsed->as_string;
}