Skip to content

Commit d944718

Browse files
authored
Merge pull request #253 from brightway-lca/ecospold1
Add complete import and ecospold1 export
2 parents 9a2e219 + 2a097d9 commit d944718

File tree

6 files changed

+705
-173
lines changed

6 files changed

+705
-173
lines changed

bw2io/export/ecospold1.py

Lines changed: 384 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,384 @@
1+
from datetime import datetime
2+
from os import times
3+
from pathlib import Path
4+
from typing import Dict, Union
5+
6+
import numpy as np
7+
from lxml import etree
8+
from stats_arrays.distributions import (
9+
LognormalUncertainty,
10+
NormalUncertainty,
11+
NoUncertainty,
12+
TriangularUncertainty,
13+
UndefinedUncertainty,
14+
UniformUncertainty,
15+
)
16+
17+
from .. import __version__ as version
18+
19+
# Qualified name of the ``xsi:schemaLocation`` attribute placed on the root
# ``ecoSpold`` element.
attr_qname = etree.QName(
    "http://www.w3.org/2001/XMLSchema-instance",
    "schemaLocation",
)

# Namespace map for the generated document: the default (unprefixed) namespace
# is EcoSpold01, and ``xsi`` is the XML Schema instance namespace.
nsmap = {
    None: "http://www.EcoInvent.org/EcoSpold01",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
}

# The package version may be exposed as a tuple of parts; flatten it to a
# dotted string so it can be embedded in the ``generator`` attribute.
if isinstance(version, tuple):
    version = ".".join(map(str, version))
27+
28+
29+
def bool_to_text(b: Union[bool, str]) -> str:
    """Convert a boolean-like value to the XML text ``"true"`` or ``"false"``.

    Strings are compared case-insensitively after stripping surrounding
    whitespace. ``"yes"``, ``"true"`` and ``"1"`` map to ``"true"``;
    ``""``, ``"no"``, ``"false"`` and ``"0"`` map to ``"false"``.
    ``None`` maps to ``"false"``. Any other value is accepted only if it
    compares equal to ``True`` or ``False`` (so ``1`` and ``0`` work too).

    Raises:
        ValueError: If ``b`` cannot be interpreted as a boolean.
    """
    if isinstance(b, str):
        text = b.strip().lower()
        if text in ("yes", "true", "1"):
            return "true"
        if text in ("", "no", "false", "0"):
            return "false"
    elif b is None:
        return "false"
    elif b in (True, False):
        # Tuple membership uses ``==``, so unhashable values are rejected with
        # the ValueError below instead of a TypeError.
        return "true" if b else "false"
    raise ValueError(f"Can't convert {b} to boolean string")
36+
37+
38+
def stripper(obj: str, prefix: str) -> str:
    """Return ``obj`` without a leading ``prefix``.

    Only a single leading occurrence is removed; ``obj`` is returned
    unchanged when it does not start with ``prefix``.
    """
    return obj.removeprefix(prefix)
43+
44+
45+
def pretty_number(val: float) -> str:
    """Format ``val`` as a compact decimal string.

    Magnitudes strictly between 0.01 and 100 use positional notation; every
    other value (including zero) uses scientific notation. Both use a
    precision of 6 and NumPy's ``trim="0"`` mode.
    """
    use_positional = 1e-2 < abs(val) < 1e2
    formatter = (
        np.format_float_positional if use_positional else np.format_float_scientific
    )
    return formatter(val, precision=6, trim="0")
50+
51+
52+
class Ecospold1Exporter:
    """Export one or more datasets to Ecospold1 XML.

    The combination of `Ecospold1DataExtractor` and `Ecospold1Exporter` does
    not give perfect roundtrip data flow, especially if data is coming from
    closed-source software with unspecified behaviour. The following
    differences have been observed:

    * This class uses an updated Ecospold1 set of XSDs from https://github.com/sami-m-g/pyecospold/tree/main/pyecospold/schemas/v1
    * The dataset `generator` field is different (`bw2io`)
    * The dataset `number` is not preserved
    * Number formatting is different. We round numbers like `10.2000000000000011` to `10.2`, and always keep at least one decimal point.
    * We always include an `uncertaintyType` for exchanges
    * We always include an `infrastructureProcess` for exchanges (default `false`)
    * The field `dataGeneratorAndPublication` is not used consistently - we always fill this with dummy data.
    * We eliminate duplicate identical `person` elements
    * `person` and `source` elements can be renumbered, but references are kept correct

    Usage: create an exporter, call `add_dataset` once per dataset, then read
    `bytes` or call `write_to_file`.

    """

    # Schema location written on the root element when none is supplied.
    DEFAULT_SCHEMA_LOCATION = "https://raw.githubusercontent.com/sami-m-g/pyecospold/main/pyecospold/schemas/v1/EcoSpold01Dataset.xsd"

    # Reference ``type`` label -> ``sourceType`` code. The keys are looked up
    # verbatim, so their spelling must not be "corrected" here.
    SOURCE_MAP: Dict[str, str] = {
        "Undefined (default)": "0",
        "Article": "1",
        "Chapters in anthology": "2",
        "Seperate publication": "3",
        "Measurement on site": "4",
        "Oral communication": "5",
        "Personal written communication": "6",
        "Questionnaries": "7",
    }

    # ``source`` XML attribute -> key in a reference dictionary.
    SOURCE_FIELDS: Dict[str, str] = {
        "nameOfEditors": "editors",
        "pageNumbers": "pages",
        "year": "year",
        "title": "title",
        "titleOfAnthology": "anthology",
        "placeOfPublications": "place_of_publication",
        "publisher": "publisher",
        "journal": "journal",
        "volumeNo": "volume",
        "issueNo": "issue",
        "text": "text",
    }

    # (key in a person dictionary, ``person`` XML attribute, default value)
    PERSON_FIELDS = [
        ("identifier", "number", "1"),
        ("address", "address", ""),
        ("company", "companyCode", ""),
        ("country", "countryCode", ""),
        ("email", "email", ""),
        ("name", "name", ""),
    ]

    # Lower-cased first category values that mark a biosphere exchange as a
    # resource taken from nature rather than an emission.
    RESOURCES = {
        "natural resource",
        "natural resources",
        "resource",
        "resources",
        "raw",
    }

    # Optional ``exchange`` XML attribute -> key in an exchange dictionary.
    EXCHANGE_FIELDS: Dict[str, str] = {
        "generalComment": "comment",
        "CASNumber": "CAS number",
        "location": "location",
        "formula": "chemical formula",
        "referenceToSource": "source_reference",
        "pageNumbers": "pages",
    }

    def __init__(self, schema_location: Union[str, None] = None):
        """Create the empty ``ecoSpold`` root element.

        Args:
            schema_location: Value of the ``xsi:schemaLocation`` attribute.
                Falls back to `DEFAULT_SCHEMA_LOCATION` when empty or `None`.
        """
        self.root = etree.Element(
            "ecoSpold",
            {attr_qname: schema_location or self.DEFAULT_SCHEMA_LOCATION},
            nsmap=nsmap,
        )
        # Number of datasets added so far; used as the dataset ``number``.
        self.count = 0

    def add_dataset(self, node: dict) -> None:
        """Append one ``dataset`` element built from ``node`` to the root.

        Keys read from ``node``: ``name`` and ``unit`` (required), plus the
        optional ``tags`` (iterable of key/value pairs), ``comments``,
        ``location``, ``references``, ``authors`` and ``exchanges``.

        Raises:
            KeyError: If ``name`` or ``unit`` is missing, or an exchange lacks
                ``amount`` or ``type``.
            ValueError: If an exchange type cannot be mapped to an input or
                output group, or a boolean tag cannot be converted.
        """
        self.count += 1
        tags = dict(node.get("tags", []))
        timestamp = tags.get("ecoSpold01timestamp", datetime.now().isoformat())
        comments = node.get("comments", {})

        dataset = etree.SubElement(
            self.root,
            "dataset",
            attrib={
                "validCompanyCodes": "CompanyCodes.xml",
                "validRegionalCodes": "RegionalCodes.xml",
                "validCategories": "Categories.xml",
                "validUnits": "Units.xml",
                # Can't guarantee that datasets come from same source
                # so input numbers aren't useful.
                # We reset the exchange numbers as well.
                # They can't be used in any case as they aren't implemented
                # consistently by different LCA software.
                "number": str(self.count),
                "timestamp": timestamp,
                "generator": f"bw2io {version}",
            },
        )
        meta_information = etree.SubElement(dataset, "metaInformation")

        self._add_process_information(
            meta_information, node, tags, comments, timestamp
        )
        self._add_modelling_and_validation(meta_information, node, comments)
        self._add_administrative_information(meta_information, node)
        self._add_flow_data(dataset, node)

    def _add_process_information(
        self, meta_information, node: dict, tags: dict, comments: dict, timestamp: str
    ) -> None:
        """Write ``processInformation`` and its child elements."""
        category = tags.get("ecoSpold01category", "")
        subcategory = tags.get("ecoSpold01subCategory", "")

        process_information = etree.SubElement(meta_information, "processInformation")
        etree.SubElement(
            process_information,
            "referenceFunction",
            attrib={
                "datasetRelatesToProduct": bool_to_text(
                    tags.get("ecoSpold01datasetRelatesToProduct", True)
                ),
                "name": node["name"],
                "localName": tags.get("ecoSpold01localName", node["name"]),
                "infrastructureProcess": bool_to_text(
                    tags.get("ecoSpold01infrastructureProcess")
                ),
                # This makes no sense, this number is defined in the relevant exchange
                # "Within the ecoinvent quality network the amount of the reference flow always equals 1."
                "amount": "1",
                "unit": node["unit"],
                "category": category,
                "subCategory": subcategory,
                "localCategory": tags.get("ecoSpold01localCategory", category),
                "localSubCategory": tags.get("ecoSpold01localSubCategory", subcategory),
                "includedProcesses": comments.get("includedProcesses", ""),
                "generalComment": comments.get("generalComment", ""),
                "infrastructureIncluded": bool_to_text(
                    tags.get("ecoSpold01infrastructureIncluded")
                ),
            },
        )
        etree.SubElement(
            process_information,
            "geography",
            attrib={
                "location": node.get("location", "GLO"),
                "text": stripper(comments.get("location", ""), "Location: "),
            },
        )
        etree.SubElement(
            process_information,
            "technology",
            attrib={"text": stripper(comments.get("technology", ""), "Technology: ")},
        )
        time_period = etree.SubElement(
            process_information,
            "timePeriod",
            attrib={
                "text": stripper(comments.get("timePeriod", ""), "Time period: "),
                "dataValidForEntirePeriod": bool_to_text(
                    tags.get("ecoSpold01dataValidForEntirePeriod", True)
                ),
            },
        )
        start = etree.SubElement(time_period, "startDate")
        start.text = tags.get("ecoSpold01startDate", "1970-01-01")
        end = etree.SubElement(time_period, "endDate")
        end.text = tags.get("ecoSpold01endDate", "1970-01-01")
        etree.SubElement(
            process_information,
            "dataSetInformation",
            attrib={
                "type": str(tags.get("ecoSpold01type", "1")),
                "impactAssessmentResult": bool_to_text(
                    tags.get("ecoSpold01impactAssessmentResult")
                ),
                "timestamp": timestamp,
                "version": tags.get("ecoSpold01version", "0.0"),
                "internalVersion": tags.get("ecoSpold01internalVersion", "0.0"),
                "energyValues": str(tags.get("ecoSpold01energyValues", "0")),
                "languageCode": tags.get("ecoSpold01languageCode", "en"),
                "localLanguageCode": tags.get("ecoSpold01localLanguageCode", "de"),
            },
        )

    def _add_modelling_and_validation(
        self, meta_information, node: dict, comments: dict
    ) -> None:
        """Write ``modellingAndValidation``: representativeness and sources."""
        m_and_v = etree.SubElement(meta_information, "modellingAndValidation")
        etree.SubElement(
            m_and_v,
            "representativeness",
            attrib={
                "productionVolume": stripper(
                    comments.get("productionVolume", "unknown"), "Production volume: "
                ),
                "samplingProcedure": stripper(
                    comments.get("sampling", "unknown"), "Sampling: "
                ),
                "extrapolations": stripper(
                    comments.get("extrapolations", "unknown"), "Extrapolations: "
                ),
                "uncertaintyAdjustments": stripper(
                    comments.get("uncertaintyAdjustments", "unknown"),
                    "Uncertainty adjustments: ",
                ),
            },
        )

        for index, source in enumerate(node.get("references", []), start=1):
            # A missing, ``None`` or empty author list yields empty attributes
            # instead of an IndexError/TypeError.
            authors = source.get("authors") or [""]
            attrib = {
                "number": str(source.get("identifier", index)),
                "sourceType": self.SOURCE_MAP.get(source.get("type"), "0"),
                "firstAuthor": authors[0],
                "additionalAuthors": authors[1] if len(authors) > 1 else "",
            } | {
                xml_name: str(source.get(key))
                for xml_name, key in self.SOURCE_FIELDS.items()
                if source.get(key)
            }
            etree.SubElement(m_and_v, "source", attrib=attrib)

    def _add_administrative_information(self, meta_information, node: dict) -> None:
        """Write ``administrativeInformation`` and its ``person`` elements."""
        authors = node.get("authors", {})
        # Both elements below refer to a ``person`` by number. This must not
        # depend on the ``references`` loop, which may not have run at all.
        data_entry_person = str(authors.get("data_entry", {}).get("identifier", 1))

        admin = etree.SubElement(meta_information, "administrativeInformation")
        etree.SubElement(
            admin,
            "dataEntryBy",
            attrib={
                "person": data_entry_person,
                "qualityNetwork": "1",
            },
        )
        etree.SubElement(
            admin,
            "dataGeneratorAndPublication",
            attrib={
                "person": data_entry_person,
                "dataPublishedIn": "1",
                "referenceToPublishedSource": "1",
                "accessRestrictedTo": "0",
                "copyright": "true",
            },
        )

        for person in authors.get("people", []):
            etree.SubElement(
                admin,
                "person",
                attrib={
                    xml_name: str(person.get(key, default))
                    for key, xml_name, default in self.PERSON_FIELDS
                },
            )

    def _add_flow_data(self, dataset, node: dict) -> None:
        """Write ``flowData`` with one ``exchange`` element per exchange."""
        # Built here rather than at class level so the distribution classes
        # are only looked up when a dataset is actually exported.
        uncertainty_codes = {
            None: "0",
            NoUncertainty.id: "0",
            UndefinedUncertainty.id: "0",
            LognormalUncertainty.id: "1",
            NormalUncertainty.id: "2",
            TriangularUncertainty.id: "3",
            UniformUncertainty.id: "4",
        }

        flow_data = etree.SubElement(dataset, "flowData")
        for number, exc in enumerate(node.get("exchanges", []), start=1):
            # Resolve the group first so an unmappable exchange type fails
            # before its element is attached.
            group_tag, group_code = self._exchange_group(exc)

            attrs = {
                "number": str(number),
                "unit": str(exc.get("unit")),
                "name": exc.get("name", ""),
                "meanValue": pretty_number(exc["amount"]),
                "infrastructureProcess": bool_to_text(exc.get("infrastructureProcess")),
            } | {
                xml_name: str(exc.get(key))
                for xml_name, key in self.EXCHANGE_FIELDS.items()
                if exc.get(key)
            }

            uncertainty_type = exc.get("uncertainty type")
            # Always present, as documented on the class; distributions
            # without a dedicated code are written as "0".
            attrs["uncertaintyType"] = uncertainty_codes.get(uncertainty_type, "0")

            categories = exc.get("categories") or ()
            if categories and categories[0]:
                attrs["category"] = categories[0]
                if len(categories) > 1 and categories[1]:
                    attrs["subCategory"] = categories[1]

            scale = exc.get("scale")
            if scale:
                if uncertainty_type == LognormalUncertainty.id:
                    attrs["standardDeviation95"] = pretty_number(np.exp(scale) ** 2)
                elif uncertainty_type == NormalUncertainty.id:
                    attrs["standardDeviation95"] = pretty_number(scale * 2)

            # ``is not None`` so that a bound of exactly zero is still written.
            if exc.get("minimum") is not None:
                attrs["minValue"] = pretty_number(exc["minimum"])
            if exc.get("maximum") is not None:
                attrs["maxValue"] = pretty_number(exc["maximum"])

            exc_element = etree.SubElement(flow_data, "exchange", attrib=attrs)
            etree.SubElement(exc_element, group_tag).text = group_code

    def _exchange_group(self, exc: dict):
        """Return the ``(tag, code)`` pair of the group element for ``exc``.

        Raises:
            ValueError: If ``exc["type"]`` is not a supported exchange type.
        """
        kind = exc["type"]
        if kind == "technosphere":
            return "inputGroup", "5"
        if kind == "production":
            return "outputGroup", "0"
        if kind == "substitution":
            return "outputGroup", "1"
        if kind == "biosphere":
            categories = exc.get("categories") or ("",)
            compartment = (categories[0] or "").lower()
            if compartment in self.RESOURCES:
                # Resources are inputs from nature (4), not from the
                # technosphere (5).
                return "inputGroup", "4"
            return "outputGroup", "4"
        raise ValueError("Can't map exchange type {}".format(kind))

    @property
    def bytes(self) -> bytes:
        """The whole document as pretty-printed UTF-8 XML with a declaration."""
        return etree.tostring(
            self.root, encoding="utf-8", xml_declaration=True, pretty_print=True
        )

    def __repr__(self) -> str:
        """Return the serialized XML document as text."""
        return self.bytes.decode("utf-8")

    def write_to_file(self, filepath: Path) -> None:
        """Write the serialized XML document to ``filepath``, overwriting it."""
        with open(filepath, "wb") as f:
            f.write(self.bytes)

0 commit comments

Comments
 (0)