-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patherm_split.pl
More file actions
216 lines (172 loc) · 6.28 KB
/
erm_split.pl
File metadata and controls
216 lines (172 loc) · 6.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#!/usr/bin/perl
###
### erm_split.pl - Split the ERM import file into smaller parts
###
### Early versions of ERM had problems importing more than 4,000 or 5,000
### records at a time. This script splits the ERM import file into smaller
### parts, either by SFX target, or by a fixed number of records.
###
###
### Darryl Friesen (Darryl.Friesen@usask.ca)
### University of Saskatchewan Library
###
### Version 3.22
###
use strict;
use Getopt::Long;
my $VERSION = 'SFX-Tools/3.22 ERM File Splitter';
### Default config file is a file named 'sfx-tools.conf' located in the same directory as the scripts.
### We first need to find out what directory the scripts are being run from
use FindBin qw($Bin);
use lib "$Bin/../lib";
my $ConfigFile = $Bin . '/sfx-tools.conf';
###
### Get command line options
###
### Defaults for the command-line switches. -1 marks "not given on the
### command line" for --target and --count; 0 means debugging is off.
my ($DEBUG, $Target, $Count) = (0, -1, -1);

### Destination for every recognised switch, keyed by switch name.
my %options = (
    'config' => \$ConfigFile,
    'target' => \$Target,
    'count'  => \$Count,
    'debug'  => \$DEBUG
);

### Switch specifications:
###   --config=FILE  alternate configuration file
###   --target       plain flag
###   --count[=N]    optional number, 5000 when the value is omitted
###   --debug        may be repeated; each occurrence increments the level
GetOptions(
    'config=s'   => $options{'config'},
    'target'     => $options{'target'},
    'count:5000' => $options{'count'},
    'debug+'     => $options{'debug'}
);
###
### Read config file
###
### Open the configuration file read-only. The three-argument form of open
### is used so that a file name supplied via --config can never be
### interpreted as an open mode or a pipe.
open(my $config_fh, '<', $ConfigFile) || die('Unable to open configuration file ' . $ConfigFile . ': ' . $! . "\n");

### Built-in defaults; any setting found in the configuration file replaces
### the value given here.
my %Config = (
    'DEBUG_LEVEL'       => 0,
    'ERM_SPLIT'         => 'NONE',
    'ERM_SPLIT_RECORDS' => 5000
);

### Parse "NAME = value" lines. Setting names are stored in upper case.
### A line without an '=' stores the whole line as a name with an
### undefined value.
while (my $line = <$config_fh>) {
    chomp($line);                   # no newline
    $line =~ s/#.*//;               # no comments
    $line =~ s/^\s+//;              # no leading white
    $line =~ s/\s+$//;              # no trailing white
    next unless length($line);      # anything left?

    my ($var, $value) = split(/\s*=\s*/, $line, 2);
    $Config{uc($var)} = $value;
}

### The configuration is fully loaded; release the file handle.
close($config_fh);
### A debug level given on the command line takes precedence over the
### DEBUG_LEVEL read from the configuration file.
if ($DEBUG > 0) {
    $Config{'DEBUG_LEVEL'} = $DEBUG;
}

### Announce the tool version whenever debugging is enabled.
if ($Config{'DEBUG_LEVEL'}) {
    print "$VERSION\n";
}

### --target selects splitting by SFX target.
$Config{'ERM_SPLIT'} = 'TARGET' if ($Target > 0);

### --count selects splitting by record count and wins over --target when
### both are given. Counts below 100 are raised to 100.
if ($Count > 0) {
    if ($Count < 100) {
        $Count = 100;
    }
    @Config{'ERM_SPLIT', 'ERM_SPLIT_RECORDS'} = ('RECORDS', $Count);
}

#print '$Config{ERM_SPLIT} = ' . $Config{'ERM_SPLIT'} . "\n";
#print '$Config{ERM_SPLIT_RECORDS} = ' . $Config{'ERM_SPLIT_RECORDS'} . "\n";

### Nothing to do when no split method has been selected.
$Config{'ERM_SPLIT'} ne 'NONE' or exit;
### To keep the directory structure clean, all the split files are put
### into their own directory, erm_data, inside the configured DATA_DIR.
my $erm_data_dir = $Config{'DATA_DIR'} . '/erm_data';

### Make the directory for the data if it doesn't exist. The directory is
### created at the same path that is tested here and written to below,
### not relative to the current working directory.
if (!-d $erm_data_dir) {
    mkdir($erm_data_dir, 0775) || die("Failed to create the erm_data directory: $!\n");
}

### Cleanup any old files left over from a previous run
unlink glob($erm_data_dir . '/*.txt');
### METHOD 1: Split file based on number of records.
###
### The first line of erm_data.txt is treated as a header and repeated at
### the top of every output file. Output files are named erm.0001.txt,
### erm.0002.txt, ... inside DATA_DIR/erm_data, and each receives at most
### ERM_SPLIT_RECORDS records. Empty lines are skipped and not counted.
if ($Config{'ERM_SPLIT'} eq 'RECORDS') {
    my $file_num = 1;
    my $count    = 0;

    ### Open the erm_data.txt file (three-argument open, so the configured
    ### path is never interpreted as an open mode)
    open(my $in, '<', $Config{'DATA_DIR'} . '/erm_data.txt')
        or die("Unable to open " . $Config{'DATA_DIR'} . "/erm_data.txt file: $!\n");

    ### Open the first output file
    my $filename = $Config{'DATA_DIR'} . '/erm_data/erm.' . sprintf("%04d", $file_num) . '.txt';
    open(my $out, '>', $filename) or die("Unable to open output file $filename: $!\n");

    my $header;
    while (my $line = <$in>) {
        ### Carry forward the header (on the first line) from the original file
        if (!defined($header)) {
            $header = $line;
            print {$out} $header;
            next;
        }

        chomp($line);
        next if ($line eq '');

        ### Increment our line/record counter
        $count++;

        ### If we pass the maximum record limit, close the current file and
        ### start a new one. The close is checked because buffered write
        ### errors (e.g. a full disk) are only reported at close time.
        if ($count > $Config{'ERM_SPLIT_RECORDS'}) {
            close($out) or die("Unable to close output file $filename: $!\n");
            $count = 1;
            $file_num++;
            $filename = $Config{'DATA_DIR'} . '/erm_data/erm.' . sprintf("%04d", $file_num) . '.txt';
            open($out, '>', $filename) or die("Unable to open output file $filename: $!\n");
            print {$out} $header;
        }

        ### Write the record to the output file
        print {$out} $line . "\n";
    }

    ### Close the 2 open files
    close($out) or die("Unable to close output file $filename: $!\n");
    close($in);
}
### METHOD 2: Split file based on SFX target
###
### This is a slightly more complicated method as the entire input file
### must be read in and grouped by target before it can be dumped out.
###
### The first line of erm_data.txt is treated as a header and repeated at
### the top of every output file. Each target gets one file named
### erm.<target>.txt inside DATA_DIR/erm_data, with its records in the
### same order as they appear in the input file.
if ($Config{'ERM_SPLIT'} eq 'TARGET') {
    ### Records grouped by sanitised target name. Each value is an array of
    ### lines in input order, so no sortable counter has to be embedded in
    ### a key and record order holds for any number of records.
    my %records_for;

    ### Open the erm_data.txt file (three-argument open, so the configured
    ### path is never interpreted as an open mode)
    open(my $in, '<', $Config{'DATA_DIR'} . '/erm_data.txt')
        or die("Unable to open " . $Config{'DATA_DIR'} . "/erm_data.txt file: $!\n");

    my $header      = '';
    my $seen_header = 0;
    while (my $line = <$in>) {
        ### Carry forward the header (on the first line) from the original file
        if (!$seen_header) {
            $seen_header = 1;
            $header      = $line;
            next;
        }

        chomp($line);
        next if ($line eq '');

        ### Split the line; the SFX target is the 6th pipe-delimited element.
        ### A record with fewer than 6 elements is filed under the empty target.
        my @elements = split(/\|/, $line);
        my $target   = defined($elements[5]) ? $elements[5] : '';

        ### Make the target usable as part of a file name: drop brackets,
        ### colons, quotes and dots; turn whitespace, '@' and slashes into '_'.
        $target =~ s#[\(\)\[\]:'"\.]##g;
        $target =~ s#[\s\@\\\/]#_#g;

        push(@{ $records_for{$target} }, $line);
    }
    close($in);

    ### Write each target's records to its own file
    foreach my $target (sort keys(%records_for)) {
        my $filename = $Config{'DATA_DIR'} . '/erm_data/erm.' . $target . '.txt';
        open(my $out, '>', $filename) or die("Unable to open output file $filename: $!\n");

        print {$out} $header;
        foreach my $record (@{ $records_for{$target} }) {
            print {$out} $record . "\n";
        }

        ### Buffered write errors are only reported at close time
        close($out) or die("Unable to close output file $filename: $!\n");
    }
}