-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdkan_harvest_plus.module
183 lines (162 loc) · 5.55 KB
/
dkan_harvest_plus.module
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
<?php
/**
 * @file
 * Support for non-compatible data.json endpoints for dkan_harvest.
 *
 * These endpoints return raw data directly instead of POD-format metadata.
 */
/**
 * Implements hook_harvest_source_types().
 *
 * Declares the "XML data endpoint" harvest source type and names the cache
 * callback and migration class associated with it.
 *
 * @return array
 *   Source type definitions keyed by machine name.
 */
function dkan_harvest_plus_harvest_source_types() {
  $source_types = array();

  $source_types['xml_data_endpoint'] = array(
    'machine_name' => 'xml_data_endpoint',
    'label' => 'XML data endpoint',
    'cache callback' => 'dkan_harvest_plus_xmldata_cache',
    'migration class' => 'DatajsonHarvestMigration',
  );

  return $source_types;
}
/**
 * Cache callback for raw XML data endpoints.
 *
 * Downloads the content found at the source URI and, when something was
 * retrieved, passes it to dkan_harvest_plus_cache_xml_to_json() to build the
 * harvest cache.
 *
 * @param HarvestSource $source
 *   The harvest source whose uri is fetched.
 * @param int $harvest_updatetime
 *   Last harvest update time.
 *
 * @return HarvestCache|null
 *   A harvest cache object, or NULL when the download failed or returned
 *   empty content.
 */
function dkan_harvest_plus_xmldata_cache(HarvestSource $source, $harvest_updatetime) {
  // This is needed for remote uri: allow a very long timeout (in seconds).
  $timeout_option = array('timeout' => 36000);
  $stream_options = array(
    'http' => $timeout_option,
    'https' => $timeout_option,
  );
  $context = stream_context_create($stream_options);

  // Warnings are suppressed; a failed download just yields a falsy value.
  $response = @file_get_contents($source->uri, 0, $context);
  if (!$response) {
    return;
  }

  return dkan_harvest_plus_cache_xml_to_json($response, $source, $harvest_updatetime);
}
/**
 * Builds the harvest cache entry for an XML data endpoint.
 *
 * The endpoint supplies only raw data, not a POD-format dataset, so the
 * dataset JSON is assembled from a per-source template:
 *
 * 1. The XML passed in $data is parsed and written as a CSV file named
 *    "<machine name>.csv" under public://dkan-plus-datasets. The file is
 *    rewritten from scratch on every call.
 * 2. The template "<machine name>.json" is read from the "dataset_templates"
 *    directory of this module. It carries all the dataset metadata, so one
 *    template has to exist for every source.
 * 3. The @download_url, @media_type, @format and @issued_date placeholders of
 *    the template are replaced; @download_url points at the CSV file.
 * 4. The resulting JSON is written into the source cache directory, in a file
 *    named after the source machine name.
 *
 * The cache entry is flagged as failed when the XML cannot be parsed, the CSV
 * cannot be opened, the template is missing or empty, or the JSON cannot be
 * written. Otherwise it is flagged as processed.
 *
 * @param string $data
 *   Harvested XML data.
 * @param HarvestSource $source
 *   Harvest source instance.
 * @param int $harvest_updatetime
 *   Last harvest update time.
 *
 * @return HarvestCache
 *   HarvestCache object.
 */
function dkan_harvest_plus_cache_xml_to_json($data, HarvestSource $source, $harvest_updatetime) {
  $harvest_cache = new HarvestCache($source, $harvest_updatetime);
  $identifier = $source->machineName;
  $harvest_dataset_file = implode('/', array(
    $source->getCacheDir(),
    $identifier,
  ));

  // We need common folder to save all datasets from dkan plus enabled sources.
  $dkan_plus_dataset_dir = 'public://dkan-plus-datasets';
  if (!is_dir($dkan_plus_dataset_dir)) {
    mkdir($dkan_plus_dataset_dir, 0777);
  }
  $csv_file = implode('/', array($dkan_plus_dataset_dir, $identifier . '.csv'));

  // Stays TRUE when there is no data to convert, as no CSV is attempted then.
  $csv_ready = TRUE;
  if (!empty($data)) {
    $csv_ready = FALSE;
    $xml = simplexml_load_string($data);
    if ($xml !== FALSE) {
      // Open with 'w' so each harvest replaces the previous CSV. Appending
      // would stack a new byte order mark, header row and full copy of the
      // data onto the old content at every run.
      $f = fopen($csv_file, 'w');
      if ($f !== FALSE) {
        createCsv($xml, $f);
        fclose($f);
        $csv_ready = TRUE;
      }
    }
  }

  $cached = FALSE;
  if ($csv_ready) {
    // We get default template and replace values since we don't get metadata
    // and endpoint doesn't provide data in POD format.
    $dataset_template = implode('/', array(
      drupal_get_path('module', 'dkan_harvest_plus'),
      'dataset_templates',
      $identifier . '.json',
    ));
    $default_json_dataset = @file_get_contents($dataset_template);
    if ($default_json_dataset) {
      $replacements = array(
        '@download_url' => file_create_url($csv_file),
        // The backslash is kept literally: "\/" is the JSON escape for "/".
        '@media_type' => 'text\/csv',
        '@format' => 'csv',
        // NOTE(review): date_now() is not defined in this file (presumably
        // the Date API helper); confirm it renders as the intended date.
        '@issued_date' => date_now(),
      );
      $final_dataset = strtr($default_json_dataset, $replacements);
      $cached = @file_put_contents($harvest_dataset_file, $final_dataset);
    }
  }

  if (!$cached) {
    $harvest_cache->setCacheEntryFailed($identifier, $source->label);
  }
  else {
    // This will reset the failed flag if previously set.
    $harvest_cache->setCacheEntryProcessed($identifier, $source->label);
  }
  return $harvest_cache;
}
/**
 * Writes the items of an XML document to an open file handle as CSV.
 *
 * A UTF-8 byte order mark is written first. The property names of the first
 * item form the header row, then every item contributes one row holding its
 * property values. All fields are enclosed in double quotes, separated by
 * commas, and each row ends with CRLF.
 *
 * @param object|array $xml
 *   Iterable whose items are objects; this file passes the SimpleXMLElement
 *   returned by simplexml_load_string().
 * @param resource $f
 *   File handle opened for writing.
 */
function createCsv($xml, $f) {
  fwrite($f, "\xEF\xBB\xBF");
  $header_written = FALSE;
  foreach ($xml as $details) {
    // Hold the arrays in variables so the calls below are valid whether or
    // not arrayToCsv() takes its first argument by reference; handing it a
    // function result directly raises "Only variables should be passed by
    // reference".
    $row = get_object_vars($details);
    if (!$header_written) {
      $columns = array_keys($row);
      fwrite($f, arrayToCsv($columns, ',', '"', TRUE, FALSE) . "\r\n");
      $header_written = TRUE;
    }
    fwrite($f, arrayToCsv($row, ',', '"', TRUE, FALSE) . "\r\n");
  }
}
/**
 * Formats a line (passed as a fields array) as CSV and returns it as a string.
 *
 * Adapted from http://us3.php.net/manual/en/function.fputcsv.php#87120
 *
 * @param array $fields
 *   Field values of the line. The array is not modified.
 * @param string $delimiter
 *   Field separator.
 * @param string $enclosure
 *   Enclosure character; occurrences inside a field are doubled.
 * @param bool $encloseAll
 *   TRUE to enclose every field. When FALSE only fields containing the
 *   delimiter, the enclosure or whitespace are enclosed.
 * @param bool $nullToMysqlNull
 *   TRUE to output NULL fields as the bare word NULL.
 *
 * @return string
 *   The CSV line, without a line terminator.
 */
function arrayToCsv(array $fields, $delimiter = ';', $enclosure = '"', $encloseAll = FALSE, $nullToMysqlNull = FALSE) {
  $delimiter_esc = preg_quote($delimiter, '/');
  $enclosure_esc = preg_quote($enclosure, '/');
  // Built once, and without any padding spaces: a space inside the pattern is
  // a literal character and would stop plain delimiters from matching.
  $needs_enclosure = '/(?:' . $delimiter_esc . '|' . $enclosure_esc . '|\s)/';

  $output = array();
  foreach ($fields as $field) {
    if ($field === NULL && $nullToMysqlNull) {
      $output[] = 'NULL';
      continue;
    }
    // Normalise to a string; NULL becomes an empty field.
    $field = (string) $field;
    // Enclose fields containing $delimiter, $enclosure or whitespace.
    if ($encloseAll || preg_match($needs_enclosure, $field)) {
      $output[] = $enclosure . str_replace($enclosure, $enclosure . $enclosure, $field) . $enclosure;
    }
    else {
      $output[] = $field;
    }
  }
  return implode($delimiter, $output);
}