-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcsvp.ml
383 lines (333 loc) · 13.5 KB
/
csvp.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
type field = string;;
type record = field array;;
exception Bad_parameters;;
exception Bad_syntax of string;;
let default_fields_separator = ',';;
let default_field_quoter = '"';;
let default_records_separator = '\n';;
let initial_field_buffer_length = 64;;
let initial_record_buffer_length = 16;;
type csv_parameters = {
compressed : bool;
quote_all : bool;
field_quoter : char;
fields_separator : char;
records_separator : char;
};;
let make_csv_parameters
?(compressed = true)
?(quote_all = false)
?(field_quoter = default_field_quoter)
?(fields_separator = default_fields_separator)
?(records_separator = default_records_separator)
() =
if field_quoter = fields_separator ||
field_quoter = records_separator ||
fields_separator = records_separator then
raise Bad_parameters
else
{ compressed; quote_all; fields_separator; field_quoter;
records_separator;
};;
(****************************************************************)
type csv_status_common = {
parameters : csv_parameters;
mutable record_buffer_length : int;
mutable record_buffer : field array;
mutable current_field_number : int;
mutable current_record_number : int;
};;
(****************************************************************)
(* The Writer Status *)
type csv_writer_status = {
out_channel : out_channel;
common : csv_status_common;
};;
let make_csv_writer_status_from_channel parameters out_channel =
{ out_channel;
common = {
parameters;
current_field_number = 0;
record_buffer_length = initial_record_buffer_length;
record_buffer = Array.make initial_record_buffer_length "";
current_record_number = 0;
}
};;
let make_csv_writer_status_from_filename parameters filename =
make_csv_writer_status_from_channel parameters (open_out_bin filename);;
let finished_with_this_csv_writer writer_status =
close_out writer_status.out_channel;
writer_status.common.record_buffer <- [||];;
(****************************************************************)
(* The Reader Status *)
type csv_reader_status = {
in_channel : in_channel;
common : csv_status_common;
mutable field_buffer_length : int;
mutable field_buffer : Bytes.t;
mutable field_buffer_fill_pointer : int;
mutable record_buffer_fill_pointer : int;
mutable record_at_end : bool;
mutable input_at_end : bool
};;
let make_csv_reader_status_from_channel parameters in_channel =
{ in_channel;
common = {
parameters;
record_buffer_length = initial_record_buffer_length;
record_buffer = Array.make initial_record_buffer_length "";
current_field_number = 0;
current_record_number = 0;
};
field_buffer_length = initial_field_buffer_length;
field_buffer = Bytes.create initial_field_buffer_length;
field_buffer_fill_pointer = 0;
record_buffer_fill_pointer = 0;
record_at_end = false;
input_at_end = false;
};;
let make_csv_reader_status_from_filename parameters filename =
make_csv_reader_status_from_channel parameters (open_in_bin filename);;
let no_bytes = Bytes.create 0;;
let finished_with_this_csv_reader reader_status =
close_in reader_status.in_channel;
reader_status.field_buffer <- no_bytes;
reader_status.common.record_buffer <- [||];;
(****************************************************************)
(* Buffers Management *)
let grow_array array len default =
let new_len = 2 * len + 1 in
let new_array = Array.make new_len default in
Array.blit array 0 new_array 0 len;
(new_array, new_len);;
(*
let grow_bytes bytes len =
let new_len = 2 * len + 1 in
let new_bytes = Bytes.create new_len in
Bytes.blit bytes 0 new_bytes 0 len;
(new_bytes, new_len);;
*)
let grow_bytes bytes len =
let additional_space = len + 1 in
(Bytes.extend bytes 0 additional_space, len + additional_space);;
let grow_record_buffer common_status =
let record_buffer', record_buffer_length' =
grow_array common_status.record_buffer common_status.record_buffer_length "" in
common_status.record_buffer <- record_buffer';
common_status.record_buffer_length <- record_buffer_length';;
let grow_field_buffer reader_status =
let field_buffer', field_buffer_length' =
grow_bytes reader_status.field_buffer reader_status.field_buffer_length in
reader_status.field_buffer <- field_buffer';
reader_status.field_buffer_length <- field_buffer_length';;
(****************************************************************)
(* Default Fields in Compressed Records *)
let set_record_default common field =
if common.current_field_number = common.record_buffer_length then begin
grow_record_buffer common
end;
common.record_buffer.(common.current_field_number) <- field;;
let get_record_default { record_buffer;
record_buffer_length;
current_field_number } =
if current_field_number >= record_buffer_length then ""
else record_buffer.(current_field_number);;
(****************************************************************)
(* The Writer *)
let needs_quoting parameters field =
(* This function is called only when is known
* that field <> buffered field... *)
let len = String.length field in
if len = 0 then begin
(* ...hence when compressed, len = 0 => needs_quoting *)
parameters.compressed
end else if field.[0] = parameters.field_quoter then true
else let rec loop i =
if i = len then false
else let c = field.[i] in
c = parameters.fields_separator ||
c = parameters.records_separator ||
loop (succ i)
in loop 0;;
let write_quoted_field out_channel field_quoter field =
output_char out_channel field_quoter;
for i = 0 to String.length field - 1 do
let c = field.[i] in
output_char out_channel c;
if c = field_quoter then begin
output_char out_channel field_quoter
end
done;
output_char out_channel field_quoter;;
let write_field writer_status field =
let { out_channel; common } = writer_status in
if common.current_field_number >= common.record_buffer_length then begin
grow_record_buffer common
end;
let { current_field_number; record_buffer;
record_buffer_length; parameters } = common in
let { compressed; quote_all; fields_separator; field_quoter } = parameters in
if current_field_number > 0 then begin
output_char out_channel fields_separator
end;
if not compressed || not (field = record_buffer.(current_field_number)) then begin
if quote_all || needs_quoting parameters field then begin
write_quoted_field out_channel field_quoter field;
end else begin
output_string out_channel field
end;
record_buffer.(current_field_number) <- field
end;
common.current_field_number <- succ current_field_number;;
let new_record writer_status =
let { out_channel; common } = writer_status in
output_char out_channel common.parameters.records_separator;
common.current_field_number <- 0;;
let write_field_list writer_status list =
List.iter (fun field -> write_field writer_status field) list;
new_record writer_status;;
let write_record_array writer_status array =
for i = 0 to Array.length array - 1 do
write_field writer_status array.(i)
done;
new_record writer_status;;
let write_record = write_record_array;;
(****************************************************************)
let artifex = "PETRVS·PAVLVS·NEPTVNENSIS·ME·FECIT·MMXVI";;
(****************************************************************)
(* The Reader *)
let reset_field_buffer reader_status =
reader_status.field_buffer_fill_pointer <- 0;;
let field_accumulate_char reader_status char =
if reader_status.field_buffer_fill_pointer = reader_status.field_buffer_length then begin
grow_field_buffer reader_status
end;
Bytes.set reader_status.field_buffer reader_status.field_buffer_fill_pointer char;
reader_status.field_buffer_fill_pointer <- succ reader_status.field_buffer_fill_pointer;;
let extract_field_buffer_content reader_status =
Bytes.sub_string reader_status.field_buffer 0 reader_status.field_buffer_fill_pointer;;
let reset_record_buffer (reader_status : csv_reader_status) =
reader_status.common.current_field_number <- 0;
reader_status.record_buffer_fill_pointer <- 0;;
let record_accumulate_field reader_status field =
let { common; record_buffer_fill_pointer } = reader_status in
if record_buffer_fill_pointer = common.record_buffer_length then begin
grow_record_buffer common
end;
common.record_buffer.(reader_status.record_buffer_fill_pointer) <- field;
reader_status.record_buffer_fill_pointer <- succ reader_status.record_buffer_fill_pointer;;
let extract_record_buffer_content (reader_status : csv_reader_status) =
Array.sub reader_status.common.record_buffer 0 reader_status.record_buffer_fill_pointer;;
let read_unquoted_field reader_status c0 =
let { in_channel; common } = reader_status in
let { records_separator; fields_separator } = common.parameters in
let rec loop c0 =
field_accumulate_char reader_status c0;
match input_char in_channel with
| exception End_of_file ->
reader_status.record_at_end <- true;
reader_status.input_at_end <- true;
extract_field_buffer_content reader_status
| c when c = records_separator ->
reader_status.record_at_end <- true;
extract_field_buffer_content reader_status
| c when c = fields_separator ->
extract_field_buffer_content reader_status
| c -> loop c
in loop c0;;
let read_quoted_field reader_status =
(* The opening quote has already been consumed. *)
let { in_channel; common } = reader_status in
let { field_quoter; fields_separator; records_separator } = common.parameters in
let rec loop () =
match input_char in_channel with
| exception End_of_file ->
raise (Bad_syntax (Printf.sprintf
"Data ended while inside a quoted field at record %d, field %d"
common.current_record_number
common.current_field_number))
| c0 when c0 = field_quoter ->
begin match input_char in_channel with
| exception End_of_file ->
reader_status.record_at_end <- true;
reader_status.input_at_end <- true;
extract_field_buffer_content reader_status
| c1 when c1 = field_quoter ->
field_accumulate_char reader_status c1;
loop ()
| c1 when c1 = fields_separator ->
extract_field_buffer_content reader_status
| c1 when c1 = records_separator ->
reader_status.record_at_end <- true;
extract_field_buffer_content reader_status
| _ ->
raise (Bad_syntax (Printf.sprintf
"Extraneous character after closing quote at record %d, field %d"
common.current_record_number
common.current_field_number))
end
| c0 -> field_accumulate_char reader_status c0;
loop ()
in loop ();;
let read_field reader_status =
if reader_status.record_at_end then begin
reader_status.record_at_end <- false;
reset_record_buffer reader_status;
reader_status.common.current_field_number <- 0;
None
end else begin
let { in_channel; common } = reader_status in
let { parameters } = common in
reset_field_buffer reader_status;
let field = match input_char in_channel with
| exception End_of_file ->
reader_status.record_at_end <- true;
reader_status.input_at_end <- true;
None
| c0 when c0 = parameters.fields_separator ->
if parameters.compressed then Some (get_record_default common)
else Some ""
| c0 when c0 = parameters.records_separator ->
reader_status.record_at_end <- true;
if parameters.compressed then Some (get_record_default common)
else Some ""
| c0 when c0 = parameters.field_quoter ->
let field = read_quoted_field reader_status in
set_record_default common field;
Some field
| c0 ->
let field = read_unquoted_field reader_status c0 in
set_record_default common field;
Some field
in if reader_status.input_at_end then begin
finished_with_this_csv_reader reader_status;
end else begin
common.current_field_number <- succ common.current_field_number;
end;
field
end;;
let input_at_end { input_at_end } = input_at_end;;
let read_record reader_status =
let ({ common } : csv_reader_status) = reader_status in
if reader_status.input_at_end then begin
None
end else begin
reset_record_buffer reader_status;
reader_status.record_at_end <- false;
let record =
let rec loop () =
match read_field reader_status with
| Some field ->
record_accumulate_field reader_status field;
if reader_status.record_at_end then
Some (extract_record_buffer_content reader_status)
else begin
loop ()
end
| None ->
if reader_status.record_buffer_fill_pointer = 0 then None
else Some (extract_record_buffer_content reader_status)
in loop ()
in common.current_record_number <- succ common.current_record_number;
record
end;;