|
| 1 | +from datetime import datetime |
| 2 | +from os import times |
| 3 | +from pathlib import Path |
| 4 | +from typing import Dict, Union |
| 5 | + |
| 6 | +import numpy as np |
| 7 | +from lxml import etree |
| 8 | +from stats_arrays.distributions import ( |
| 9 | + LognormalUncertainty, |
| 10 | + NormalUncertainty, |
| 11 | + NoUncertainty, |
| 12 | + TriangularUncertainty, |
| 13 | + UndefinedUncertainty, |
| 14 | + UniformUncertainty, |
| 15 | +) |
| 16 | + |
| 17 | +from .. import __version__ as version |
| 18 | + |
# XML Schema instance namespace, shared by the `xsi` prefix and the
# `schemaLocation` attribute written on the document root.
_XSI_NAMESPACE = "http://www.w3.org/2001/XMLSchema-instance"

# Fully qualified name of the `xsi:schemaLocation` attribute.
attr_qname = etree.QName(_XSI_NAMESPACE, "schemaLocation")

# Namespace map for the root element: the `None` key is the default
# (unprefixed) namespace used by all EcoSpold01 elements.
nsmap = {
    None: "http://www.EcoInvent.org/EcoSpold01",
    "xsi": _XSI_NAMESPACE,
}

# The package version may be given as a tuple such as (0, 9, 1); turn it
# into a dotted string so it can be embedded in the `generator` attribute.
if isinstance(version, tuple):
    version = ".".join(str(part) for part in version)
| 27 | + |
| 28 | + |
def bool_to_text(b: Union[bool, str, None]) -> str:
    """Convert a boolean-like value to the XML literal ``"true"`` or ``"false"``.

    Strings are compared case-insensitively, so ``"TRUE"``, ``"Yes"`` and
    ``"no"`` are all accepted. ``None`` and the empty string count as false,
    which lets callers pass the result of ``dict.get`` for an optional flag.

    Args:
        b: ``True``/``False`` (or a value equal to one of them, such as
            ``1`` or ``0``), ``None``, or one of the strings ``"yes"``,
            ``"true"``, ``"no"``, ``"false"``, ``""`` in any letter case.

    Returns:
        ``"true"`` or ``"false"``.

    Raises:
        ValueError: If ``b`` is none of the recognised values.
    """
    if isinstance(b, str):
        lowered = b.lower()
        if lowered in ("yes", "true"):
            return "true"
        if lowered in ("", "no", "false"):
            return "false"
    elif b is None:
        return "false"
    elif b in (True, False):
        # Equality-based membership keeps accepting 1/0 like the tuple
        # lookup did, while rejecting other numbers such as 2.
        return "true" if b else "false"
    raise ValueError(f"Can't convert {b} to boolean string")
| 36 | + |
| 37 | + |
def stripper(obj: str, prefix: str) -> str:
    """Return ``obj`` without a leading ``prefix``.

    The text is returned unchanged when it does not start with ``prefix``.
    Only one leading occurrence is removed.
    """
    return obj.removeprefix(prefix)
| 43 | + |
| 44 | + |
def pretty_number(val: float) -> str:
    """Format a number as a compact decimal string.

    Values whose magnitude lies strictly between 0.01 and 100 are written
    in positional notation; everything else (including zero) is written in
    scientific notation. Both forms keep at most six fractional digits and
    trim trailing zeros down to a single digit after the decimal point.
    """
    magnitude = abs(val)
    if magnitude > 1e-2 and magnitude < 1e2:
        formatter = np.format_float_positional
    else:
        formatter = np.format_float_scientific
    return formatter(val, precision=6, trim="0")
| 50 | + |
| 51 | + |
class Ecospold1Exporter:
    """Export one or more datasets to Ecospold1 XML.

    The combination of `Ecospold1DataExtractor` and `Ecospold1Exporter` does
    not give perfect roundtrip data flow, especially if data is coming from
    closed-source software with unspecified behaviour. The following
    differences have been observed:

    * This class uses an updated Ecospold1 set of XSDs from https://github.com/sami-m-g/pyecospold/tree/main/pyecospold/schemas/v1
    * The dataset `generator` field is different (`bw2io`)
    * The dataset `number` is not preserved
    * Number formatting is different. We round numbers like `10.2000000000000011` to `10.2`, and always keep at least one decimal point.
    * We always include an `uncertaintyType` for exchanges
    * We always include an `infrastructureProcess` for exchanges (default `false`)
    * The field `dataGeneratorAndPublication` is not used consistently - we always fill this with dummy data.
    * We eliminate duplicate identical `person` elements
    * `person` and `source` elements can be renumbered, but references are kept correct

    Typical use: create an exporter, call `add_dataset` once per dataset,
    then read `bytes` or call `write_to_file`.
    """

    # Textual source type -> numeric `sourceType` code. Keys are looked up
    # verbatim from `source["type"]`, so their spelling must not be changed.
    _SOURCE_MAP: Dict[str, str] = {
        "Undefined (default)": "0",
        "Article": "1",
        "Chapters in anthology": "2",
        "Seperate publication": "3",
        "Measurement on site": "4",
        "Oral communication": "5",
        "Personal written communication": "6",
        "Questionnaries": "7",
    }

    # XML attribute name -> key in a `references` entry.
    _SOURCE_FIELDS: Dict[str, str] = {
        "nameOfEditors": "editors",
        "pageNumbers": "pages",
        "year": "year",
        "title": "title",
        "titleOfAnthology": "anthology",
        "placeOfPublications": "place_of_publication",
        "publisher": "publisher",
        "journal": "journal",
        "volumeNo": "volume",
        "issueNo": "issue",
        "text": "text",
    }

    # (key in a person dict, XML attribute name, default value).
    _PERSON_FIELDS = [
        ("identifier", "number", "1"),
        ("address", "address", ""),
        ("company", "companyCode", ""),
        ("country", "countryCode", ""),
        ("email", "email", ""),
        ("name", "name", ""),
    ]

    # Lower-cased top-level biosphere categories that mark a flow as an
    # input from nature rather than an emission.
    _RESOURCES = {
        "natural resource",
        "natural resources",
        "resource",
        "resources",
        "raw",
    }

    # `stats_arrays` distribution id -> EcoSpold01 `uncertaintyType` code
    # (0 undefined, 1 lognormal, 2 normal, 3 triangular, 4 uniform).
    _UNCERTAINTY_MAPPING = {
        None: "0",
        NoUncertainty.id: "0",
        UndefinedUncertainty.id: "0",
        LognormalUncertainty.id: "1",
        NormalUncertainty.id: "2",
        TriangularUncertainty.id: "3",
        UniformUncertainty.id: "4",
    }

    # XML attribute name -> key in an exchange dict (optional attributes).
    _EXCHANGE_FIELDS: Dict[str, str] = {
        "generalComment": "comment",
        "CASNumber": "CAS number",
        "location": "location",
        "formula": "chemical formula",
        "referenceToSource": "source_reference",
        "pageNumbers": "pages",
    }

    def __init__(self, schema_location: Union[str, None] = None):
        """Create an empty `ecoSpold` document.

        Args:
            schema_location: Value for the root `xsi:schemaLocation`
                attribute. Falls back to the pyecospold-hosted
                `EcoSpold01Dataset.xsd` URL when empty or ``None``.
        """
        self.root = etree.Element(
            "ecoSpold",
            {
                attr_qname: schema_location
                or "https://raw.githubusercontent.com/sami-m-g/pyecospold/main/pyecospold/schemas/v1/EcoSpold01Dataset.xsd"
            },
            nsmap=nsmap,
        )
        # Number of datasets added so far; doubles as the dataset `number`.
        self.count = 0

    def add_dataset(self, node: dict) -> None:
        """Append one `dataset` element built from `node` to the document.

        Args:
            node: Dataset description. `name` and `unit` are required.
                Optional keys read here are `tags` (a mapping or iterable of
                key/value pairs holding `ecoSpold01*` entries), `comments`,
                `location`, `references`, `authors` and `exchanges`. Each
                exchange needs `amount` and `type`; biosphere exchanges also
                need `categories`.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If an exchange `type` cannot be mapped to an input
                or output group, or a boolean tag has an unrecognised value.
        """
        self.count += 1
        tags = dict(node.get("tags", []))
        timestamp = tags.get("ecoSpold01timestamp", datetime.now().isoformat())
        comments = node.get("comments", {})

        dataset = etree.SubElement(
            self.root,
            "dataset",
            attrib={
                "validCompanyCodes": "CompanyCodes.xml",
                "validRegionalCodes": "RegionalCodes.xml",
                "validCategories": "Categories.xml",
                "validUnits": "Units.xml",
                # Can't guarantee that datasets come from same source
                # so input numbers aren't useful.
                # We reset the exchange numbers as well.
                # They can't be used in any case as they aren't implemented
                # consistently by different LCA software.
                "number": str(self.count),
                "timestamp": timestamp,
                "generator": f"bw2io {version}",
            },
        )
        meta_information = etree.SubElement(dataset, "metaInformation")

        # Child order matters for schema validation; keep this sequence.
        self._add_process_information(
            meta_information, node, tags, comments, timestamp
        )
        self._add_modelling_and_validation(meta_information, node, comments)
        self._add_administrative_information(meta_information, node)
        self._add_flow_data(dataset, node)

    def _add_process_information(
        self, meta_information, node: dict, tags: dict, comments: dict, timestamp: str
    ) -> None:
        """Write `processInformation` and its five child elements."""
        category = tags.get("ecoSpold01category", "")
        subcategory = tags.get("ecoSpold01subCategory", "")

        process_information = etree.SubElement(meta_information, "processInformation")
        etree.SubElement(
            process_information,
            "referenceFunction",
            attrib={
                "datasetRelatesToProduct": bool_to_text(
                    tags.get("ecoSpold01datasetRelatesToProduct", True)
                ),
                "name": node["name"],
                "localName": tags.get("ecoSpold01localName", node["name"]),
                "infrastructureProcess": bool_to_text(
                    tags.get("ecoSpold01infrastructureProcess")
                ),
                # This makes no sense, this number is defined in the relevant exchange
                # "Within the ecoinvent quality network the amount of the reference flow always equals 1."
                "amount": "1",
                "unit": node["unit"],
                "category": category,
                "subCategory": subcategory,
                "localCategory": tags.get("ecoSpold01localCategory", category),
                "localSubCategory": tags.get("ecoSpold01localSubCategory", subcategory),
                "includedProcesses": comments.get("includedProcesses", ""),
                "generalComment": comments.get("generalComment", ""),
                "infrastructureIncluded": bool_to_text(
                    tags.get("ecoSpold01infrastructureIncluded")
                ),
            },
        )
        etree.SubElement(
            process_information,
            "geography",
            attrib={
                "location": node.get("location", "GLO"),
                "text": stripper(comments.get("location", ""), "Location: "),
            },
        )
        etree.SubElement(
            process_information,
            "technology",
            attrib={"text": stripper(comments.get("technology", ""), "Technology: ")},
        )
        time_period = etree.SubElement(
            process_information,
            "timePeriod",
            attrib={
                "text": stripper(comments.get("timePeriod", ""), "Time period: "),
                "dataValidForEntirePeriod": bool_to_text(
                    tags.get("ecoSpold01dataValidForEntirePeriod", True)
                ),
            },
        )
        start = etree.SubElement(time_period, "startDate")
        start.text = tags.get("ecoSpold01startDate", "1970-01-01")
        end = etree.SubElement(time_period, "endDate")
        end.text = tags.get("ecoSpold01endDate", "1970-01-01")
        etree.SubElement(
            process_information,
            "dataSetInformation",
            attrib={
                "type": str(tags.get("ecoSpold01type", "1")),
                "impactAssessmentResult": bool_to_text(
                    tags.get("ecoSpold01impactAssessmentResult")
                ),
                "timestamp": timestamp,
                "version": tags.get("ecoSpold01version", "0.0"),
                "internalVersion": tags.get("ecoSpold01internalVersion", "0.0"),
                "energyValues": str(tags.get("ecoSpold01energyValues", "0")),
                "languageCode": tags.get("ecoSpold01languageCode", "en"),
                "localLanguageCode": tags.get("ecoSpold01localLanguageCode", "de"),
            },
        )

    def _add_modelling_and_validation(
        self, meta_information, node: dict, comments: dict
    ) -> None:
        """Write `modellingAndValidation`: representativeness plus sources."""
        m_and_v = etree.SubElement(meta_information, "modellingAndValidation")
        etree.SubElement(
            m_and_v,
            "representativeness",
            attrib={
                "productionVolume": stripper(
                    comments.get("productionVolume", "unknown"), "Production volume: "
                ),
                "samplingProcedure": stripper(
                    comments.get("sampling", "unknown"), "Sampling: "
                ),
                "extrapolations": stripper(
                    comments.get("extrapolations", "unknown"), "Extrapolations: "
                ),
                "uncertaintyAdjustments": stripper(
                    comments.get("uncertaintyAdjustments", "unknown"),
                    "Uncertainty adjustments: ",
                ),
            },
        )

        for index, source in enumerate(node.get("references", [])):
            # Tolerate a missing, `None` or empty author list.
            authors = source.get("authors") or [""]
            attrs = {
                "number": str(source.get("identifier", index + 1)),
                "sourceType": self._SOURCE_MAP.get(source.get("type"), "0"),
                "firstAuthor": authors[0],
                # NOTE(review): only the second author is exported; any
                # further authors are dropped. Confirm whether they should
                # be joined into this attribute instead.
                "additionalAuthors": authors[1] if len(authors) > 1 else "",
            }
            # Optional bibliographic fields are only written when non-empty.
            attrs.update(
                {
                    attribute: str(source[key])
                    for attribute, key in self._SOURCE_FIELDS.items()
                    if source.get(key)
                }
            )
            etree.SubElement(m_and_v, "source", attrib=attrs)

    def _add_administrative_information(self, meta_information, node: dict) -> None:
        """Write `administrativeInformation`: data entry, publication, people."""
        authors = node.get("authors", {})
        # Both references below point at the data-entry person, defaulting
        # to person number 1 when the node does not name one.
        data_entry_person = str(authors.get("data_entry", {}).get("identifier", 1))

        admin = etree.SubElement(meta_information, "administrativeInformation")
        etree.SubElement(
            admin,
            "dataEntryBy",
            attrib={
                # A person reference, not a source number: this must not be
                # derived from the `references` loop.
                "person": data_entry_person,
                "qualityNetwork": "1",
            },
        )
        etree.SubElement(
            admin,
            "dataGeneratorAndPublication",
            attrib={
                "person": data_entry_person,
                "dataPublishedIn": "1",
                "referenceToPublishedSource": "1",
                "accessRestrictedTo": "0",
                "copyright": "true",
            },
        )

        for person in authors.get("people", []):
            etree.SubElement(
                admin,
                "person",
                attrib={
                    attribute: str(person.get(key, default))
                    for key, attribute, default in self._PERSON_FIELDS
                },
            )

    def _add_flow_data(self, dataset, node: dict) -> None:
        """Write `flowData` with one `exchange` element per exchange."""
        flow_data = etree.SubElement(dataset, "flowData")
        for index, exc in enumerate(node.get("exchanges", [])):
            uncertainty_type = exc.get("uncertainty type")

            attrs = {
                "number": str(index + 1),
                "unit": str(exc.get("unit")),
                "name": exc.get("name", ""),
                "meanValue": pretty_number(exc["amount"]),
                "infrastructureProcess": bool_to_text(exc.get("infrastructureProcess")),
            }
            # lxml only accepts string attribute values, so stringify the
            # optional fields (source references and pages may be ints).
            attrs.update(
                {
                    attribute: str(exc[key])
                    for attribute, key in self._EXCHANGE_FIELDS.items()
                    if exc.get(key)
                }
            )
            # Always written; missing or unmapped distributions become "0".
            attrs["uncertaintyType"] = self._UNCERTAINTY_MAPPING.get(
                uncertainty_type, "0"
            )

            categories = exc.get("categories") or ()
            if categories and categories[0]:
                attrs["category"] = categories[0]
                if len(categories) > 1 and categories[1]:
                    attrs["subCategory"] = categories[1]

            scale = exc.get("scale")
            if scale:
                if uncertainty_type == LognormalUncertainty.id:
                    # `scale` is on the log scale: exp(scale) squared.
                    attrs["standardDeviation95"] = pretty_number(np.exp(scale) ** 2)
                elif uncertainty_type == NormalUncertainty.id:
                    attrs["standardDeviation95"] = pretty_number(scale * 2)

            # Compare against None so that a legitimate bound of 0 is kept.
            if exc.get("minimum") is not None:
                attrs["minValue"] = pretty_number(exc["minimum"])
            if exc.get("maximum") is not None:
                attrs["maxValue"] = pretty_number(exc["maximum"])

            # Resolve the group first so an unmappable exchange type does
            # not leave a half-built `exchange` element behind.
            group_tag, group_code = self._exchange_group(exc)
            exc_element = etree.SubElement(flow_data, "exchange", attrib=attrs)
            etree.SubElement(exc_element, group_tag).text = group_code

    @classmethod
    def _exchange_group(cls, exc: dict):
        """Return the (element name, code) pair classifying an exchange.

        Codes follow the EcoSpold01 schema: `inputGroup` 4 is "from nature"
        and 5 is "from technosphere"; `outputGroup` 0 is the reference
        product, 1 an avoided product system and 4 "to nature".

        Raises:
            ValueError: If the exchange `type` is not one of `technosphere`,
                `production`, `substitution` or `biosphere`.
        """
        kind = exc["type"]
        if kind == "technosphere":
            return "inputGroup", "5"
        if kind == "production":
            return "outputGroup", "0"
        if kind == "substitution":
            return "outputGroup", "1"
        if kind == "biosphere":
            if exc["categories"][0].lower() in cls._RESOURCES:
                # Natural resources are inputs from nature, not technosphere.
                return "inputGroup", "4"
            return "outputGroup", "4"
        raise ValueError("Can't map exchange type {}".format(exc["type"]))

    @property
    def bytes(self) -> bytes:
        """The whole document as pretty-printed UTF-8 XML with declaration."""
        return etree.tostring(
            self.root, encoding="utf-8", xml_declaration=True, pretty_print=True
        )

    def __repr__(self) -> str:
        """Return the serialised XML document as text."""
        return self.bytes.decode("utf-8")

    def write_to_file(self, filepath: Path) -> None:
        """Write the serialised XML document to `filepath`, overwriting it."""
        with open(filepath, "wb") as f:
            f.write(self.bytes)
0 commit comments