-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarcConversion.php
executable file
·4746 lines (3815 loc) · 227 KB
/
marcConversion.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<?php
# Class to handle conversion of the data to MARC format
class marcConversion
{
# Per-record state surfaced through the getter methods; convertToMarc () resets the error string, pre-merge record, source registry, item count and filter tokens at the start of every run
# NOTE(review): many further properties are created dynamically in the constructor and in convertToMarc () rather than being declared here, which PHP 8.2 deprecates - consider declaring them
private $errorHtml = '';
private $marcPreMerge = null;
private $sourceRegistry = [];
private $itemRecords = 0;
private $filterTokens = [];
private $status = null;

# Cache of lookup tables
private $lookupTablesCache = [];

# Handle to the ISBN library, assigned in the constructor
private $isbn;

# Merge types, as code => description
private $mergeTypes = [
	'TIP' => 'Exact Title match and ISSN match, and top answer in Probablistic search',
	'TP' => 'Exact Title, but not ISSN, and top answer in Probablistic search',
	'IP' => 'ISSN match, but not exact title, and top answer in Probablistic search',
	'P' => 'Probable match, unconfirmed, and top answer in Probablistic search',
	'C' => 'probable match, Confirmed',
];

# Location codes, as regexp => code
# NB The numeric type must stay first in this list for the reports to work correctly
private $locationCodes = [
	'[0-9]{1,3} ?[A-Z]' => 'SPRI-SER', // Serial
	'Periodical' => 'SPRI-SER', // Analytics (has parent serial, or has a reference in square brackets to a serial with which it is shelved)
	'Archives' => 'SPRI-ARC',
	'Atlas' => 'SPRI-ATL',
	'Basement' => 'SPRI-BMT',
	"Bibliographers' Office" => 'SPRI-BIB',
	'Cupboard' => 'SPRI-CBD',
	'Folio' => 'SPRI-FOL',
	'Large Atlas' => 'SPRI-LAT',
	"Librarian's Office" => 'SPRI-LIO',
	'Library Office' => 'SPRI-LIO',
	'Map Room' => 'SPRI-MAP',
	'Pam' => 'SPRI-PAM',
	'Picture Library' => 'SPRI-PIC',
	'Reference' => 'SPRI-REF',
	'Russian' => 'SPRI-RUS',
	'Shelf' => 'SPRI-SHF',
	'Special Collection' => 'SPRI-SPC',
	'Theses' => 'SPRI-THE',
	'Digital Repository' => 'SPRI-ELE',
	'Electronic Resource \(online\)' => 'SPRI-ELE', // No items any more
	"Friends' Room" => 'SPRI-FRI',
	'Museum Working Collection' => 'SPRI-MUS',
	'Shelved with pamphlets' => 'SPRI-PAM',
	'Shelved with monographs' => 'SPRI-SHF',
	'Destroyed during audit' => 'IGNORE',
	// SPRI-NIS defined in marcConversion code
];

# Filter tokens, as token => description
private $filterTokenDescriptions = [
	'SUPPRESS-EXPLICITLY' => 'Record marked specifically to suppress, e.g. pamphlets needing review, etc.',
	'SUPPRESS-MISSINGQ' => 'Missing with ?',
	'SUPPRESS-MISSINGLOCATIONPERIODICAL' => 'Missing with *location=Periodical unmatched so remaining',
	'SUPPRESS-PICTURELIBRARYVIDEO' => 'Picture Library Store videos',
	'IGNORE-DESTROYEDCOPIES' => 'Item has been destroyed during audit',
	'IGNORE-IGS' => 'IGS locations',
	'IGNORE-ELECTRONICREMOTE' => 'Digital records',
	'IGNORE-STATUSRECEIVED' => 'Item is being processed, i.e. has been accessioned and is with a bibliographer for classifying and cataloguing',
	'IGNORE-STATUSONORDER' => 'Item on order >1 year ago so unlikely to be fulfilled, but item remains desirable and of bibliographic interest',
	'IGNORE-NOTINSPRI' => 'Items held not in SPRI',
	'IGNORE-LOCATIONUL' => 'Items held at the UL, i.e. elsewhere',
];

# Known *ks values that denote a status rather than a classification
private $ksStatusTokens = [
	'MISSING',
	'PGA', // Record intended for inclusion in next issue of PGA
	'X', // Serial (issue(s)) not yet abstracted
	'Y', // Host item with analytics on card catalogue
	'Z', // Host item not yet analyzed
	'C', // Current serial
	'D', // Dead serial
];

# Placeholder for the transliteration process note
private $transliterationProcessNotePlaceholder = '<transliterationProcessNote>';

# Keyword within *status that requests suppression
private $suppressionStatusKeyword = 'SUPPRESS';

# Acquisition date cut-off for on-order -type items, whose dates range from 22/04/1992 to 30/10/2015
# The intention is that 'recent' on-order items (intended to be 1 year ago) are migrated but suppressed, and the rest deleted; however, this needs review, as the newest is 2016/12/05
private $acquisitionDate = '2015-01-01';

# Fields supported for transliteration upgrade (BGN/PCGN -> Library of Congress), at either (top/bottom) level of a record
# Other fields confirmed as unlikely, using: `SELECT field, count(*) FROM `catalogue_processed` WHERE field NOT IN ('kw', 'ks', 'abs', 'doslink', 'winlink', 'lang', 'tc', 'tt', 'location') AND recordLanguage = 'Russian' AND `value` LIKE '%ya%' GROUP BY field;`
private $transliterationUpgradeFields = [
	'n1', 'n2', 'nd', // 1xx, 7xx; NB Keep these three together as generate245::classifyNdField() (as called from generate245::statementOfResponsibility() ), and generate245::roleAndSiblings() assumes they will be in sync in terms of transliteration
	'to', // 240; NB Then stripped, except records with *lto
	't', // 245
	'ta', // 246
	'pu', // 260 NB *pl not in scope of transliteration - see note in generate260
	'ts', // 490
	'note', // 505 NB Then stripped, except for 'Contents: ' note records (minus known non-Russian)
	// (773 from host) // 773
	'ft', // 780
	'st', // 785
];

# Fields used for transliteration name matching
private $transliterationNameMatchingFields = [
	'n1',
];

# HTML tags that may appear in output and are subsequently stripped
private $htmlTags = ['<em>', '</em>', '<sub>', '</sub>', '<sup>', '</sup>'];
# Constructor
public function __construct ($muscatConversion, $transliteration)
{
	# Takes the parent application object, from which shared resources are borrowed, and the transliteration object
	# The statements below are kept in a fixed order, as later ones are handed $this and may rely on what earlier ones set up
	
	# Shared resources from the parent class
	$this->databaseConnection = $muscatConversion->databaseConnection;
	$this->settings = $muscatConversion->settings;
	$this->applicationRoot = $muscatConversion->applicationRoot;
	$this->baseUrl = $muscatConversion->baseUrl;
	
	# Transliteration object, and the languages it can reverse-transliterate
	$this->transliteration = $transliteration;
	$this->supportedReverseTransliterationLanguages = $transliteration->getSupportedReverseTransliterationLanguages ();
	
	# Double dagger, written as its three UTF-8 bytes; used as the subfield marker
	$this->doubleDagger = "\xe2\x80\xa1";
	
	# Leading articles list and diacritics table
	$this->leadingArticles = $this->leadingArticles ();
	$this->diacriticsTable = $this->diacriticsTable ();
	
	# ISBN library
	$this->isbn = new Isbn\Isbn();
	
	# Authors generator, given the language modes with 'default' at the front
	$reverseTransliterationLanguageNames = array_keys ($this->supportedReverseTransliterationLanguages);
	$languageModes = array_merge (array ('default'), $reverseTransliterationLanguageNames);
	$this->generateAuthors = new generateAuthors ($this, $languageModes);
	
	# 008 and 245 generators
	$this->generate008 = new generate008 ($this);
	$this->generate245 = new generate245 ($this);
	
	# Registry of *pu shard language values
	$this->puLanguages = $this->getPuLanguages ();
}
# Getter for error HTML string
public function getErrorHtml ()
{
	# Returns the accumulated error text wrapped in a warning paragraph, or the (empty) error string itself when there is nothing to report
	if (!$this->errorHtml) {return $this->errorHtml;}
	
	# Prefix a link to the record only when a record ID is known; convertToMarc () can fail before it has read the ID, in which case the property may not exist at all
	$recordLink = '';
	if (!empty ($this->recordId)) {
		$recordLink = " Record <a href=\"{$this->baseUrl}/records/{$this->recordId}/\">{$this->recordId}</a>: ";
	}
	
	# Assemble and return the HTML
	return "\n<p class=\"warning\"><img src=\"/images/icons/exclamation.png\" class=\"icon\" />" . $recordLink . "MARC conversion error: {$this->errorHtml}</p>";
}
# Getter for MARC pre-merge
public function getMarcPreMerge ()
{
# Returns the generated record as it stood immediately before merging; convertToMarc () resets this to NULL on every run and only fills it in when a merge type was supplied
return $this->marcPreMerge;
}
# Getter for source registry
public function getSourceRegistry ()
{
# Returns the array written by mergeWithExistingVoyager (), which records for each line of the merged record whether it came from the local record ('M') or the Voyager record ('V'); empty when no merge took place
return $this->sourceRegistry;
}
# Getter for item records count
public function getItemRecords ()
{
# Returns the item records count for the most recent conversion; convertToMarc () resets this to 0 at the start of each run (the code that increases it is presumably elsewhere in this class)
return $this->itemRecords;
}
# Getter for filter tokens, as a string
public function getFilterTokensString ()
{
# Joins the filter tokens registered for the current record into a single comma-and-space separated string; an empty string results when there are none
return implode (', ', $this->filterTokens);
}
# Getter for status
public function getStatus ()
{
	# Derives the status keyword for the current record from its filter tokens and item records count
	# Precedence: migrate beats any other token present (5 cases, e.g. /records/118221/ - but not directly testable), then suppress beats any other (no actual cases), then ignore
	# Returns NULL when none of the conditions applies
	$tokens = $this->getFilterTokensString ();
	
	# Migrate
	if (substr_count ($tokens, 'MIGRATE')) {
		if ($this->itemRecords > 0) {return 'migratewithitem';}	// E.g. /records/5749/ (test #935)
		if ($this->itemRecords == 0) {return 'migrate';}	// E.g. /records/1130/ (test #936)
	}
	
	# Suppress
	if (substr_count ($tokens, 'SUPPRESS-')) {
		if ($this->itemRecords > 0) {return 'suppresswithitem';}	// E.g. /records/52260/ (test #937)
		if ($this->itemRecords == 0) {return 'suppress';}	// E.g. /records/82493/ (test #938)
	}
	
	# Ignore
	if (substr_count ($tokens, 'IGNORE-')) {
		return 'ignore';	// E.g. /records/1282/ (test #939)
	}
	
	# No status applies
	return NULL;
}
# Getter for definitions
public function getMergeTypes ()
{
# Returns the merge type definitions, keyed by merge type code (e.g. 'TIP') with a description of each as the value
return $this->mergeTypes;
}
public function getLocationCodes ()
{
# Returns the location definitions, keyed by regexp fragment with the location code (e.g. 'SPRI-SER') as the value; the numeric (serial) pattern is the first entry
return $this->locationCodes;
}
public function getKsStatusTokens ()
{
# Returns the list of *ks values that represent a status rather than a classification
return $this->ksStatusTokens;
}
public function getSuppressionStatusKeyword ()
{
# Returns the keyword that marks suppression within *status
return $this->suppressionStatusKeyword;
}
public function getDiacriticsTable ()
{
# Returns the diacritics table that the constructor loaded via diacriticsTable ()
return $this->diacriticsTable;
}
public function getAcquisitionDate ()
{
# Returns the acquisition date cut-off used for on-order -type items, as a date string
return $this->acquisitionDate;
}
public function getTransliterationUpgradeFields ()
{
# Returns the list of field names supported for transliteration upgrade (BGN/PCGN -> Library of Congress)
return $this->transliterationUpgradeFields;
}
public function getTransliterationNameMatchingFields ()
{
# Returns the list of field names used for transliteration name matching
return $this->transliterationNameMatchingFields;
}
public function getHtmlTags ()
{
# Returns the list of HTML tags (opening and closing forms) that may be present in output and are later stripped
return $this->htmlTags;
}
# Getter for ISBN library handle
public function getIsbn ()
{
# Returns the Isbn\Isbn object that the constructor instantiated
return $this->isbn;
}
# Main entry point
# Local documentation at: http://www.lib.cam.ac.uk/libraries/login/bibstandard/bibstandards.htm
public function convertToMarc ($marcParserDefinition, $recordXml, $mergeDefinition = array (), $mergeType = false, $mergeVoyagerId = false, $stripLeaderInMerge = true)
{
	# Converts a single record, supplied as an XML fragment, into a textual MARC record by applying the supplied parser definition
	# Returns the record text, or false if conversion had to be abandoned (getErrorHtml () then normally explains why); non-fatal problems add to the error text but the record is still returned
	# If $mergeType is supplied, the generated record is then merged with the Voyager record identified by $mergeVoyagerId, using the entry for that merge type within $mergeDefinition; the pre-merge record remains available from getMarcPreMerge ()
	
	# Reset the error string and source registry so that they are clean for each iteration
	$this->errorHtml = '';
	$this->marcPreMerge = NULL;
	$this->sourceRegistry = array ();
	
	# Clear the record ID, so that a failure occurring before the ID has been read is not reported by getErrorHtml () against the previous record
	$this->recordId = NULL;
	
	# Create fresh containers for 880 reciprocal links for this record
	$this->field880subfield6ReciprocalLinks = array ();		// This is indexed by the master field, ignoring any mutations within multilines
	$this->field880subfield6Index = 0;
	$this->field880subfield6FieldInstanceIndex = array ();
	
	# Ensure the second-pass record ID flag is clean; this is used for a second-pass arising from 773 processing where the host does not exist at time of processing
	$this->secondPassRecordId = NULL;
	
	# Reset property handles for item records count and filter tokens
	$this->itemRecords = 0;
	$this->filterTokens = array ();
	
	# Ensure the line-by-line syntax is valid, extract macros, and construct a data structure representing the record
	if (!$datastructure = $this->convertToMarc_InitialiseDatastructure ($recordXml, $marcParserDefinition)) {return false;}
	
	# Load the record as a valid XML object
	$this->xml = $this->loadXmlRecord ($recordXml);
	
	# Determine the record number, used by several macros
	$this->recordId = $this->xPathValue ($this->xml, '//q0');
	
	# End if not all macros are supported
	if (!$this->convertToMarc_MacrosAllSupported ($datastructure)) {return false;}
	
	# Determine the record type
	$this->recordType = $this->recordType ();
	
	# Up-front, process author fields
	$this->authorsFields = $this->generateAuthors->createAuthorsFields ($this->xml);
	
	# Up-front, look up the host record, if any
	$this->hostRecord = $this->lookupHostRecord ($this->hostRecordId /* returned by reference */);
	
	# Lookup XPath values from the record which are needed multiple times, for efficiency
	$this->form = $this->xPathValue ($this->xml, '(//form)[1]', false);
	
	# Up-front, process *p/*pt to parse into its component parts
	$this->pOrPt = $this->parsePOrPt ();
	
	# Perform XPath replacements
	if (!$datastructure = $this->convertToMarc_PerformXpathReplacements ($datastructure)) {return false;}
	
	# Expand vertically-repeatable fields
	if (!$datastructure = $this->convertToMarc_ExpandVerticallyRepeatableFields ($datastructure)) {return false;}
	
	# Process the record
	$record = $this->convertToMarc_ProcessRecord ($datastructure);
	
	# Determine the length, which is the first five characters of the 000 (Leader), padded
	# NOTE(review): the original comment describes this as a length in bytes, but mb_strlen () counts characters in the mbstring internal encoding, so the two differ for records containing multi-byte characters - confirm which is intended before changing
	$bytes = mb_strlen ($record);
	$bytes = str_pad ($bytes, 5, '0', STR_PAD_LEFT);	// E.g. /records/1003/ has 984 bytes so becomes 00984 (test #229)
	$record = preg_replace ('/^LDR (_____)/m', "LDR {$bytes}", $record);
	
	# If required, merge with an existing Voyager record, retaining the pre-merge record for getMarcPreMerge ()
	if ($mergeType) {
		$this->marcPreMerge = $record;	// Save original record pre-merge
		$record = $this->mergeWithExistingVoyager ($record, $mergeDefinition, $mergeType, $mergeVoyagerId, $stripLeaderInMerge);
	}
	
	# Report any UTF-8 problems; this is checked directly, because htmlspecialchars () only returns an empty string for invalid input when ENT_SUBSTITUTE is not set, and that flag is part of the default from PHP 8.1
	if (!mb_check_encoding ($record, 'UTF-8')) {
		$this->errorHtml .= 'UTF-8 conversion failed.';
		return false;
	}
	
	# Do a check to report any case of an invalid subfield indicator
	if (preg_match_all ("/{$this->doubleDagger}[^a-z0-9]/u", $record, $matches)) {
		$this->errorHtml .= 'Invalid ' . (count ($matches[0]) == 1 ? 'subfield' : 'subfields') . " (" . implode (', ', $matches[0]) . ") detected.";
		// Leave the record visible rather than return false
	}
	
	# Do a check to report any case where 880 fields do not have both a field (starting validly with a $6) and a link back; e.g. /records/1062/ has "245 ## 6880-01" and "880 ## 6245-01/(N" (tests #230, #231)
	preg_match_all ("/^880 [0-9#]{2} {$this->doubleDagger}6/m", $record, $matches);
	$total880fields = count ($matches[0]);
	$total880dollar6Instances = substr_count ($record, "{$this->doubleDagger}6880-");
	if ($total880fields != $total880dollar6Instances) {
		$this->errorHtml .= "Mismatch in 880 field/link counts ({$total880fields} vs {$total880dollar6Instances}).";
		// Leave the record visible rather than return false
	}
	
	# Return the record
	return $record;
}
# Function to return memory usage
public function memoryUsage ()
{
	# Reports the memory currently allocated to PHP, converted from bytes to megabytes and rounded to three decimal places
	$bytesPerMegabyte = 1024 * 1024;
	$megabytes = memory_get_usage () / $bytesPerMegabyte;
	return round ($megabytes, 3);
}
# Getter for second-pass record ID
public function getSecondPassRecordId ()
{
	# Returns the second-pass record ID flag, which convertToMarc () resets to NULL at the start of each run
	# The property is first created by convertToMarc (), so guard against this getter being called before any conversion has run
	return (isSet ($this->secondPassRecordId) ? $this->secondPassRecordId : NULL);
}
# Function to get a list of supported macros
public function getSupportedMacros ()
{
	# Builds the list of macro names this class supports, by finding every method named macro_<name> and returning the <name> parts
	$macros = array ();
	foreach (get_class_methods ($this) as $methodName) {
		
		# Skip methods that are not macros
		if (!preg_match ('/^macro_([a-zA-Z0-9_]+)/', $methodName, $nameParts)) {continue;}
		
		# Register the name without its macro_ prefix
		$macros[] = $nameParts[1];
	}
	
	# Return the list
	return $macros;
}
# Function to perform merge of a MARC record with an existing Voyager record
private function mergeWithExistingVoyager ($localRecord, $mergeDefinitions, $mergeType, $mergeVoyagerId, $stripLeaderInMerge)
{
	# Merges the locally-generated record text with the existing Voyager record, field by field, and returns the merged record text
	# $mergeDefinitions is indexed by merge type; the entry for $mergeType maps a field number to one of 'M', 'V', 'M else V', 'V else M' or 'V and M', and any field without an entry is copied from both sources
	# As a side-effect, the source registry is filled in with 'M' or 'V' for each line of the merged record
	# If merging is disabled, the merge type is unknown, or the Voyager record cannot be retrieved, the local record is returned unchanged (with an error noted in the latter two cases)
	
	# Return the record unchanged if merging is not enabled
	if (!$this->settings['mergingEnabled']) {
		return $localRecord;
	}
	
	# If the merge type is unsupported, report this and fall back to the unmerged local record
	if (!isSet ($this->mergeTypes[$mergeType])) {
		$this->errorHtml .= "Merge failed: unsupported merge type {$mergeType}. The local record has been put in, without merging.";
		return $localRecord;
	}
	
	# Select the merge definition to use; if none is defined for this merge type, no field has a strategy, so every field is copied from both sources
	$mergeDefinition = (isSet ($mergeDefinitions[$mergeType]) ? $mergeDefinitions[$mergeType] : array ());
	
	# Get the existing Voyager record
	if (!$voyagerRecord = $this->getExistingVoyagerRecord ($mergeVoyagerId, $stripLeaderInMerge)) {
		$this->errorHtml .= "Merge failed: could not retrieve existing Voyager record. The local record has been put in, without merging.";
		return $localRecord;
	}
	
	# Parse out the local MARC record and the Voyager record into nested structures
	$localRecordStructure = $this->parseMarcRecord ($localRecord);
	$voyagerRecordStructure = $this->parseMarcRecord ($voyagerRecord);
	
	# Create a superset list of all fields across both types of record
	$allFieldNumbers = array_merge (array_keys ($localRecordStructure), array_keys ($voyagerRecordStructure));
	$allFieldNumbers = array_unique ($allFieldNumbers);
	sort ($allFieldNumbers, SORT_NATURAL);	// This will order by number but put LDR at the end
	
	# Move LDR from the end to the start; if the final entry is not LDR (i.e. neither record has a Leader), put it back where it was rather than promoting a numeric field
	$lastFieldNumber = array_pop ($allFieldNumbers);
	if ($lastFieldNumber === 'LDR') {
		array_unshift ($allFieldNumbers, $lastFieldNumber);
	} else if ($lastFieldNumber !== NULL) {
		$allFieldNumbers[] = $lastFieldNumber;
	}
	
	# Create a superstructure, where all fields are present from the superset, sub-indexed by source; if a field is not present it will not be present in the result (test #232)
	$superstructure = array ();
	foreach ($allFieldNumbers as $fieldNumber) {
		$superstructure[$fieldNumber] = array (
			'muscat'	=> (isSet ($localRecordStructure[$fieldNumber]) ? $localRecordStructure[$fieldNumber] : NULL),
			'voyager'	=> (isSet ($voyagerRecordStructure[$fieldNumber]) ? $voyagerRecordStructure[$fieldNumber] : NULL),
		);
	}
	
	# Perform merge based on the specified strategy; the record lines and the source registry are appended to in step, so share the same indexes
	$recordLines = array ();
	$sourceRegistry = array ();
	foreach ($superstructure as $fieldNumber => $recordPair) {
		
		# By default, assume the lines for this field are copied across into the eventual record from both sources
		$muscat = true;
		$voyager = true;
		
		# If there is a merge definition, apply its algorithm
		if (isSet ($mergeDefinition[$fieldNumber])) {
			switch ($mergeDefinition[$fieldNumber]) {
				
				case 'M':	// E.g. /records/1033/ (tests #233, #234)
					$voyager = false;
					break;
					
				case 'V':	// E.g. /records/10506/ (test #235)
					$muscat = false;
					break;
					
				case 'M else V':	// No definitions yet, so no tests
					if ($recordPair['muscat']) {
						$voyager = false;
					} else {
						$muscat = false;
					}
					break;
					
				case 'V else M':	// E.g. /records/1033/ (tests #236, #237)
					if ($recordPair['voyager']) {
						$muscat = false;
					} else {
						$voyager = false;
					}
					break;
					
				case 'V and M':	// E.g. /records/50968/ , /records/12775/ (tests #238, #239, 240, 241)
					// Both remain enabled
					break;
			}
		}
		
		# Extract the full line from each of the local lines
		if ($muscat && $recordPair['muscat']) {
			foreach ($recordPair['muscat'] as $recordLine) {
				$recordLines[] = $recordLine['fullLine'];
				$sourceRegistry[] = 'M';
			}
		}
		
		# Extract the full line from each of the voyager lines
		if ($voyager && $recordPair['voyager']) {
			foreach ($recordPair['voyager'] as $recordLine) {
				$recordLines[] = $recordLine['fullLine'];
				$sourceRegistry[] = 'V';
			}
		}
	}
	
	# Implode the record lines
	$record = implode ("\n", $recordLines);
	
	# Register the source registry
	$this->sourceRegistry = $sourceRegistry;
	
	# Return the merged record
	return $record;
}
# Function to obtain the data for an existing Voyager record, as a multi-dimensional array indexed by field then an array of lines for that field
public function getExistingVoyagerRecord ($mergeVoyagerId, $stripLeaderInMerge = true, &$errorText = '')
{
	# Retrieves an existing Voyager record and returns it as a text block, one line per field, in the same textual layout as generated records
	# Returns false if the ID is not a single numeric ID or the record cannot be found; in both cases $errorText, passed by reference, is set to an explanation
	# If $stripLeaderInMerge is set, the Leader (LDR) line is left out of the result
	
	# If the merge voyager ID is not yet a pure integer (i.e. not yet a one-to-one lookup), state this and end
	# The value is cast first, because ctype_digit () given an integer treats small values as ASCII codes, and non-string arguments are deprecated from PHP 8.1; false (the "no ID" default in convertToMarc) casts to an empty string, which correctly fails
	if (!ctype_digit ((string) $mergeVoyagerId)) {
		$errorText = 'There is not yet a one-to-one match, so no Voyager record can be displayed.';
		return false;
	}
	
	# Look up Voyager record, or end (e.g. no match)
	if (!$voyagerRecordShards = $this->databaseConnection->select ($this->settings['database'], 'catalogue_external', array ('voyagerId' => $mergeVoyagerId))) {
		$errorText = "Error: the specified Voyager record (#{$mergeVoyagerId}) could not be found in the external datasource.";
		return false;
	}
	
	# Deal with the Leader (LDR), of which there is only one
	foreach ($voyagerRecordShards as $shardId => $shard) {
		if ($shard['field'] !== 'LDR') {continue;}
		
		if ($stripLeaderInMerge) {
			
			# During import (but not in dynamic loading), remove the Leader in the merge record, to avoid a double-leader, which causes Bibcheck to fail; e.g. /records/1011/ (test #787)
			unset ($voyagerRecordShards[$shardId]);
			
		} else {
			
			# Otherwise, replace spaces with # in the Leader, to use the same format as generated records
			$voyagerRecordShards[$shardId]['data'] = str_replace (' ', '#', $shard['data']);
		}
		
		break;	// Only one, so stop loop
	}
	
	# Construct the record lines; the Leader and 00x control fields have no indicators
	$recordLines = array ();
	foreach ($voyagerRecordShards as $shard) {
		$hasIndicators = (!preg_match ('/^(LDR|00[0-9])$/', $shard['field']));	// E.g. /records/29550/ (tests #242, #243)
		$recordLines[] = $shard['field'] . ($hasIndicators ? ' ' . $shard['indicators'] : '') . ' ' . $shard['data'];
	}
	
	# Implode to text string
	$record = implode ("\n", $recordLines);
	
	# Return the record text block
	return $record;
}
# Function to load an XML record string as XML
public function loadXmlRecord ($recordXml)
{
	# Wraps the supplied XML fragment in a prolog and a single <root> element, and returns the result parsed as a SimpleXMLElement
	# The prolog's opening and closing markers are assembled from pieces, as in the original, so that they never appear whole in this source file
	$prolog = '<' . '?xml version="1.0" encoding="utf-8"?' . '>';
	$document = "{$prolog}\n<root>\n{$recordXml}\n</root>";
	return new SimpleXMLElement ($document);
}
# Function to ensure the line-by-line syntax is valid, extract macros, and construct a data structure representing the record
private function convertToMarc_InitialiseDatastructure ($record, $marcParserDefinition)
{
	# Parses the MARC parser definition into a datastructure with one entry per definition line, indexed by that line's zero-based position in the definition
	# Each entry contains: line (the definition line minus its control characters), controlCharacters, macros, xpathReplacements and marcCode
	# Returns false, with an error noted, if the definition has no usable lines or any line has invalid syntax
	# NB The $record argument is accepted but is not used by this function
	
	# Convert the definition into lines
	$lines = explode ("\n", str_replace ("\r\n", "\n", $marcParserDefinition));
	
	# Strip out empty lines and comment lines (test #244), leaving the indexes of the remaining lines untouched so that errors can quote real line numbers
	foreach ($lines as $lineNumber => $line) {
		if (!trim ($line) || mb_substr ($line, 0, 1) == '#') {
			unset ($lines[$lineNumber]);
		}
	}
	
	# End if nothing remains, explaining why, since an empty datastructure would otherwise be treated as a failure by the caller without any message
	if (!$lines) {
		$this->errorHtml .= 'The MARC parser definition contains no field lines.';
		return false;
	}
	
	# Define the pattern that finds each XPath block, e.g. 'b{//recr} ' (where 'b' is preceded by the double-dagger)
	# NB The /U flag makes quantifiers ungreedy by default, and the ? in \s*? switches that one back to greedy, so that any trailing space(s) are captured
	$xpathBlockRegexp = '/' . "({$this->doubleDagger}[a-z0-9])?" . '((R?)(i?){([^}]+)})' . "(\s*?)" . '/U';
	
	# Ensure the line-by-line syntax is valid, extract macros, and construct a data structure representing the record
	$datastructure = array ();
	foreach ($lines as $lineNumber => $line) {
		
		# Validate and extract the syntax: optional control characters, whitespace, then a three-digit field code or LDR, followed by the remainder of the line
		if (!preg_match ('/^([AER]*)\s+(([0-9]{3}|LDR) .{3}.+)$/', $line, $matches)) {
			$this->errorHtml .= 'Line ' . ($lineNumber + 1) . ' does not have the right syntax.';
			return false;
		}
		
		# Register the line with its control characters stripped away (and cached), and the MARC code; examples are: LDR, 008, 100, 245, 852 etc.
		$datastructure[$lineNumber] = array (
			'line'				=> $matches[2],
			'controlCharacters'	=> str_split ($matches[1]),
			'macros'			=> array (),
			'xpathReplacements'	=> array (),
			'marcCode'			=> $matches[3],
		);
		
		# Extract all XPath references
		preg_match_all ($xpathBlockRegexp, $line, $blocks, PREG_SET_ORDER);
		foreach ($blocks as $block) {
			$subfieldIndicator = $block[1];			// e.g. $a (actually a dagger not a $)
			$findBlock = $block[2];					// e.g. '{//somexpath}'
			$isHorizontallyRepeatable = $block[3];	// The 'R' flag
			$isIndicatorBlockMacro = $block[4];		// The 'i' flag
			$xpath = $block[5];
			$trailingSpace = $block[6];				// Trailing space(s), if any, so that these can be preserved during replacement
			
			# Firstly, register macro requirements by stripping these from the end of the XPath, e.g. {/*/isbn|macro:validisbn|macro:foobar} leaves the XPath as /*/isbn and results in $datastructure[$lineNumber]['macros']['{/*/isbn|macro:validisbn|macro:foobar}']['macrosThisXpath'] = array ('validisbn', 'foobar')
			$macrosThisXpath = array ();
			while (preg_match ('/^(.+)\|macro:([^|]+)$/', $xpath, $macroMatches)) {
				array_unshift ($macrosThisXpath, $macroMatches[2]);	// 'macro' does not appear in the result (test #245)
				$xpath = $macroMatches[1];
			}
			if ($macrosThisXpath) {
				$datastructure[$lineNumber]['macros'][$findBlock]['macrosThisXpath'] = $macrosThisXpath;	// Note that using [xpath]=>macrosThisXpath is not sufficient as lines can use the same xPath more than once
			}
			
			# Register the details of this replacement
			$datastructure[$lineNumber]['xpathReplacements'][$findBlock] = array (
				'fullBlock'					=> $block[0],					// E.g. 'b{//recr} ' ; e.g. /records/1049/ (test #247)
				'subfieldIndicator'			=> $subfieldIndicator,			// (test #248)
				'isIndicatorBlockMacro'		=> (bool) $isIndicatorBlockMacro,	// Whether this replacement is in the indicator block; e.g. /records/1108/ (test #250)
				'xPath'						=> $xpath,						// E.g. /records/1003/ (test #251)
				'horizontalRepeatability'	=> ($isHorizontallyRepeatable ? $subfieldIndicator : false),	// If horizontally-repeatable, the subfield indicator to use for imploding, resulting in e.g. $aFoo$aBar ; e.g. /records/1010/ (test #252)
				'trailingSpace'				=> $trailingSpace,				// E.g. /records/1049/ (test #246)
			);
		}
	}
	
	# Return the datastructure
	return $datastructure;
}
# Function to check all macros are supported
private function convertToMarc_MacrosAllSupported ($datastructure)
{
	# Confirms that every macro named anywhere in the datastructure has a matching macro_<name> method
	# Returns true if so; otherwise notes the unrecognised names in the error string and returns false
	$supportedMacros = $this->getSupportedMacros ();
	
	# Collect the name of each macro that is not supported, working through every XPath block on every line
	$unknownMacros = array ();
	foreach ($datastructure as $line) {
		foreach ($line['macros'] as $attributes) {
			foreach ($attributes['macrosThisXpath'] as $macro) {
				
				# Reduce e.g. name(argument) to name, by removing any bracketed argument that follows the name
				$macroName = preg_replace ('/^([a-zA-Z0-9_]+)\([^)]+\)/', '\1', $macro);
				
				if (in_array ($macroName, $supportedMacros)) {continue;}
				$unknownMacros[] = $macroName;
			}
		}
	}
	
	# No problems found
	if (!$unknownMacros) {
		return true;
	}
	
	# Report unrecognised macros
	$this->errorHtml .= 'Not all macros were recognised: ' . implode (', ', $unknownMacros);
	return false;
}
# Function to perform Xpath replacements
# NB XPath functions can have PHP modifications in them using php:functionString - may be useful in future; see: https://www.sitepoint.com/php-dom-using-xpath/ and https://www.cowburn.info/2009/10/23/php-funcs-xpath/
#   $datastructure: the parsed lines, each with 'controlCharacters', 'xpathReplacements' (keyed by find-block) and 'macros'
#   Returns the datastructure with a 'replacement' value registered against each XPath replacement; or false, with a message appended to $this->errorHtml, if any XPath expression failed
private function convertToMarc_PerformXpathReplacements ($datastructure)
{
	# Perform XPath replacements; e.g. /records/1003/ (test #251)
	$compileFailures = array ();
	foreach ($datastructure as $lineNumber => $line) {
		
		# Determine if the line is vertically-repeatable; e.g. /records/1599/ (test #253)
		$isVerticallyRepeatable = (in_array ('R', $line['controlCharacters']));
		
		# Work through each XPath replacement
		foreach ($line['xpathReplacements'] as $find => $xpathReplacementSpec) {
			$xPath = $xpathReplacementSpec['xPath'];	// Extract from structure
			
			# Determine if horizontally-repeatable; e.g. /records/1010/ (test #252)
			$isHorizontallyRepeatable = (bool) $xpathReplacementSpec['horizontalRepeatability'];
			
			# Deal with fixed strings; e.g. /records/3056/ (test #254)
			if (preg_match ("/^'(.+)'$/", $xPath, $matches)) {
				$value = array ($matches[1]);
				
			# Handle the special-case where the specified XPath is just '/', representing the whole record; this indicates that the macro will process the record as a whole, ignoring any passed in value; doing this avoids the standard XPath processor resulting in an array of two values of (1) *qo and (2) *doc/*art/*ser ; e.g. /records/3056/ (test #255)
			} else if ($xPath == '/') {
				$value = array (true);	// Ensures the result processor continues, but this 'value' is then ignored
				
			# Otherwise, handle the standard case; e.g. /records/1003/ (test #251)
			} else {
				
				# Attempt to parse; the @ suppresses any warning raised for a failing expression, as failure is detected from the false return value instead
				$xPathResult = @$this->xml->xpath ('/root' . $xPath);
				
				# Check for compile failures; the failed expression is collected so that all failures can be reported together at the end
				if ($xPathResult === false) {
					$compileFailures[] = $xPath;
					continue;
				}
				
				# Obtain the value component(s)
				$value = array ();
				foreach ($xPathResult as $node) {
					$value[] = (string) $node;
				}
			}
			
			# If there was no result, register an empty replacement and move on to the next block
			if (!$value) {
				$datastructure[$lineNumber]['xpathReplacements'][$find]['replacement'] = '';
				continue;
			}
			
			/*
			  NOTE:
			  
			  The order of processing here is important.
			  
			  Below are two steps:
			  
			  1) Assemble the string components (unless vertically-repeatable/horizontally-repeatable) into a single string:
			     e.g. {//k/kw} may end up with values 'Foo' 'Bar' 'Zog'
			     therefore these become imploded to:
			     FooBarZog
			     However, if either the R (vertically-repeatable at start of line, or horizontally-repeatable attached to macro) flag is present, then that will be stored as:
			     array('Foo', 'Bar', 'Zog')
			  
			  2) Run the value through any macros that have been defined for this XPath on this line
			     This takes effect on each value now present, i.e.
			     {//k/kw|macro::dotend} would result in either:
			     (not repeatable): FooBarZog.
			     (repeatable): array('Foo.', 'Bar.', 'Zog.')
			  
			  So, currently, the code does the merging first, then macro processing on each element.
			*/
			
			# Assemble the string components (unless vertically-repeatable or horizontally-repeatable) into a single string
			$isRepeatable = ($isVerticallyRepeatable || $isHorizontallyRepeatable);
			if (!$isRepeatable) {
				$value = implode ('', $value);
			}
			
			# Run the value through any macros that have been defined for this XPath on this line
			if (isSet ($datastructure[$lineNumber]['macros'][$find])) {
				
				# Determine the macro(s) for this Xpath
				$macros = $datastructure[$lineNumber]['macros'][$find]['macrosThisXpath'];
				
				# For a vertically-repeatable or horizontally-repeatable field, process each value; otherwise process the compiled string
				if ($isRepeatable) {
					foreach ($value as $index => $subValue) {
						$value[$index] = $this->processMacros ($subValue, $macros);
					}
				} else {
					$value = $this->processMacros ($value, $macros);
				}
			}
			
			# Finalise horizontally-repeatable fields into a single string
			if ($isHorizontallyRepeatable) {
				
				# If any sub-value has been returned as false, skip it; no known cases, but can be tested against /records/16928/ by changing 876 parser to exceptBegins(Title), which should exclude the first $x without leaving a space (i.e. "$x $xFormerly")
				foreach ($value as $index => $subValue) {
					if ($subValue === false) {
						unset ($value[$index]);
					}
				}
				
				# Apply uniqueness after macro processing; e.g. if Lang1, Lang2, Lang3 becomes translatedlangA, translatedlangB, translatedlangB, unique to translatedlangA, translatedlangB; no examples available
				$value = array_unique ($value);	// Key numbering may now have holes, but the next operation is imploding anyway
				
				# Compile with the subfield indicator as the implode string, including a space for clarity, e.g. /records/1010/ (test #752)
				$value = implode (' ' . $xpathReplacementSpec['horizontalRepeatability'], $value);
			}
			
			# Register the processed value
			$datastructure[$lineNumber]['xpathReplacements'][$find]['replacement'] = $value;	// $value is usually a string, but an array if vertically-repeatable (and not horizontally-repeatable)
		}
	}
	
	# If there are compile failures, assemble this into an error message, separating the expressions so that each can be identified
	if ($compileFailures) {
		$this->errorHtml .= 'Not all expressions compiled: ' . implode (', ', $compileFailures);
		return false;
	}
	
	# Return the datastructure
	return $datastructure;
}
# Function to expand vertically-repeatable fields
#   $datastructureUnexpanded: the lines after XPath replacement, each with 'controlCharacters' and 'xpathReplacements' whose 'replacement' is either an array of values or a string
#   Returns a new datastructure in which each line flagged 'R' that has values is replaced by one line per value, keyed "{lineNumber}_{index}"; other lines are copied across under their original key
#   A line flagged 'R' whose subfields generated inconsistent numbers of values is omitted, and a message is appended to $this->errorHtml
private function convertToMarc_ExpandVerticallyRepeatableFields ($datastructureUnexpanded)
{
	$datastructure = array ();	// Expanded version, replacing the original
	foreach ($datastructureUnexpanded as $lineNumber => $line) {
		
		# If not vertically-repeatable, copy the attributes across unamended, and move on
		if (!in_array ('R', $line['controlCharacters'])) {
			$datastructure[$lineNumber] = $line;
			continue;
		}
		
		# For vertically-repeatable, first check the counts are consistent (e.g. if //k/kw generated 7 items, and //k/ks generated 5, throw an error, as behaviour is undefined); no tests possible as this is basically now deprecated - no examples in parser left, as groupings all handled by macros now
		# While looping, also note whether any subfield actually has a value; every replacement is inspected, because a string replacement and a one-element array both count as 1, so the consistency check alone does not show that they are all empty
		$counts = array ();
		$lineHasValues = false;
		foreach ($line['xpathReplacements'] as $macroBlock => $xpathReplacementSpec) {
			$replacementValues = $xpathReplacementSpec['replacement'];
			if (is_array ($replacementValues)) {
				$counts[$macroBlock] = count ($replacementValues);
				$hasValue = ($counts[$macroBlock] > 0);
			} else {
				$counts[$macroBlock] = 1;	// A non-array (string) replacement is treated as a single value; count() is not used here as it is not valid on non-countables; see: https://wiki.php.net/rfc/counting_non_countables
				$hasValue = (strlen ((string) $replacementValues) > 0);
			}
			if ($hasValue) {
				$lineHasValues = true;
			}
		}
		if (count (array_count_values ($counts)) != 1) {
			$this->errorHtml .= 'Line ' . ($lineNumber + 1) . ' is a vertically-repeatable field, but the number of generated values in the subfields are not consistent:' . application::dumpData ($counts, false, true);
			continue;
		}
		
		# If there are no values on this line, then no expansion is needed, so copy the attributes across unamended, and move on
		if (!$lineHasValues) {
			$datastructure[$lineNumber] = $line;
			continue;
		}
		
		# Determine the number of line expansions (which the above check should ensure is consistent between each of the counts)
		$numberOfLineExpansions = application::array_first_value ($counts);	// Take the first count only
		
		# Clone the line, one for each subvalue, assigning a new key (original key, plus the subvalue index), then narrow each replacement to the value for that clone
		for ($subLine = 0; $subLine < $numberOfLineExpansions; $subLine++) {
			$newLineId = "{$lineNumber}_{$subLine}";	// e.g. 17_0, 17_1 if there are two line expansions
			$datastructure[$newLineId] = $line;
			
			# Overwrite the subfield value within the structure, so it contains only this subfield value, not the whole array of values
			foreach ($line['xpathReplacements'] as $macroBlock => $xpathReplacementSpec) {
				$replacementValues = $xpathReplacementSpec['replacement'];
				
				# Only index into arrays; a string replacement is kept whole, as indexing a string would return a single character (or raise a warning if the string is empty)
				$datastructure[$newLineId]['xpathReplacements'][$macroBlock]['replacement'] = (is_array ($replacementValues) ? $replacementValues[$subLine] : $replacementValues);
			}
		}
	}
	
	# Return the newly-expanded datastructure; e.g. /records/1599/ (test #253)
	return $datastructure;
}
# Function to process the record
private function convertToMarc_ProcessRecord ($datastructure)
{
# Process each line
$outputLines = array ();
foreach ($datastructure as $lineNumber => $attributes) {
$line = $attributes['line'];
# Perform XPath replacements if any, working through each replacement; e.g. /records/1049/ (test #247)
if ($datastructure[$lineNumber]['xpathReplacements']) {
# Start a flag for whether the line has content
$lineHasContent = false;
# Loop through each macro block; e.g. /records/1049/ (test #247)
$replacements = array ();
foreach ($datastructure[$lineNumber]['xpathReplacements'] as $macroBlock => $xpathReplacementSpec) {
$replacementValue = $xpathReplacementSpec['replacement'];
# Determine if there is content
$blockHasValue = strlen ($replacementValue);
# Register replacements
$fullBlock = $xpathReplacementSpec['fullBlock']; // The original block, which includes any trailing space(s), e.g. "a{/*/edn} " ; e.g. if optional block is skipped because of no value then following block will not have a space before: /records/1049/ (test #260)
if ($blockHasValue) {
$replacements[$fullBlock] = $xpathReplacementSpec['subfieldIndicator'] . $replacementValue . $xpathReplacementSpec['trailingSpace'];
} else {
$replacements[$fullBlock] = ''; // Erase the block
}
# Perform control character checks if the macro is a normal (general value-creation) macro, not an indicator block macro
if (!$xpathReplacementSpec['isIndicatorBlockMacro']) {
# If this content macro has resulted in a value, set the line content flag
if ($blockHasValue) {
$lineHasContent = true;
}
# If there is an 'A' (all) control character, require all placeholders to have resulted in text; e.g. /records/3056/ (test #257), /records/3057/ (test #258)
#!# Currently this takes no account of the use of a macro in the nonfiling-character section (e.g. 02), i.e. those macros prefixed with indicators; however in practice that should always return a string
if (in_array ('A', $datastructure[$lineNumber]['controlCharacters'])) {
if (!$blockHasValue) {
continue 2; // i.e. break out of further processing of blocks on this line (as further ones are irrelevant), and skip the whole line registration below
}
}
}
}
# If there is an 'E' ('any' ['either']) control character, require at least one replacement, i.e. that content (after the field number and indicators) exists; e.g. /records/1049/ (test #259)
if (in_array ('E', $datastructure[$lineNumber]['controlCharacters'])) {
if (!$lineHasContent) {
continue; // i.e. skip this line, preventing registration below
}
}
# Perform string translation on each line
$line = strtr ($line, $replacements);
}
# Determine the key to use for the line output
$i = 0;
$lineOutputKey = $attributes['marcCode'] . '_' . $i++; // Examples: LDR_0, 001_0, 100_0, 650_0
while (isSet ($outputLines[$lineOutputKey])) {
$lineOutputKey = $attributes['marcCode'] . '_' . $i++; // e.g. 650_1 for the second 650 record, 650_2 for the third, etc.
}
# Trim the line, e.g. /records/1054/ (test #261), including multilines (no examples found for tests, but tested by emulated string)
$line = implode ("\n", array_map ('trim', explode ("\n", $line))); // https://stackoverflow.com/a/1655176
# Register the value
$outputLines[$lineOutputKey] = $line;
}
# Insert 880 reciprocal links; see: http://www.lib.cam.ac.uk/libraries/login/documentation/Unicode_non_roman_cataloguing_handout.pdf ; e.g. /records/1062/ has "245 ## 6880-01" and "880 ## 6245-01" (tests #230, #231)
foreach ($this->field880subfield6ReciprocalLinks as $lineOutputKey => $linkToken) { // $lineOutputKey is e.g. 700_0
# Report data mismatches
if (!isSet ($outputLines[$lineOutputKey])) {
$this->errorHtml .= "Line output key {$lineOutputKey} does not exist in the output lines.";
}
# For multilines, split the line into parts, prepend the link token
if (is_array ($linkToken)) {
$lines = explode ("\n", $outputLines[$lineOutputKey]); // Split out
foreach ($lines as $i => $line) {
if (isSet ($linkToken[$i])) {
$lines[$i] = $this->insertSubfieldAfterMarcFieldThenIndicators ($line, $linkToken[$i]);
}
}
$outputLines[$lineOutputKey] = implode ("\n", $lines); // Reassemble; e.g. /records/1697/ (test #262)
# For standard lines, do a simple insertion
} else {
$outputLines[$lineOutputKey] = $this->insertSubfieldAfterMarcFieldThenIndicators ($outputLines[$lineOutputKey], $linkToken); // E.g. /records/1062/ (test #263)
}
}
# Deal with transliteration process note, now that all lines are known to exist, e.g. /records/189648/ (test #1032)
$outputLines = $this->transliterationProcessNote ($outputLines);