Skip to content

Commit

Permalink
Added a pollutant mapping table to fix inconsistencies in the naming …
Browse files Browse the repository at this point in the history
…of pollutants. This is especially useful for element abbreviations, e.g. Al -> Aluminum. Also fixed a couple of column headers that were accidentally being detected as pollutants.
  • Loading branch information
dbeaudoinfortin committed Jun 10, 2024
1 parent 9fd4308 commit 4065c2d
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 9 deletions.
140 changes: 140 additions & 0 deletions src/main/java/com/dbf/naps/data/globals/PollutantMapping.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
package com.dbf.naps.data.globals;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Static lookup table that maps raw pollutant names to a single consistent name.
 *
 * <p>The table contains two groups of entries: chemical element symbols mapped to
 * their full element names (for example {@code "Al"} to {@code "Aluminum"}), and a
 * handful of spelling/casing variants mapped to one canonical spelling (for example
 * {@code "Chrysene&Triphenylene"} to {@code "Chrysene & Triphenylene"}).
 *
 * <p>Matching is exact and case-sensitive: {@code "Co"} maps to {@code "Cobalt"}
 * whereas {@code "CO"} has no entry and is returned unchanged.
 *
 * <p>The table is populated once in the static initializer and is never modified by
 * this class afterwards. The class only exposes static members and cannot be
 * instantiated.
 */
public final class PollutantMapping {

    /** Raw pollutant name (exact, case-sensitive) to canonical pollutant name. */
    private static final Map<String, String> POLLUTANT_LOOKUP = new ConcurrentHashMap<>();

    static {
        // Chemical element symbols -> element names, in order of atomic number.
        POLLUTANT_LOOKUP.put("H", "Hydrogen");
        POLLUTANT_LOOKUP.put("He", "Helium");
        POLLUTANT_LOOKUP.put("Li", "Lithium");
        POLLUTANT_LOOKUP.put("Be", "Beryllium");
        POLLUTANT_LOOKUP.put("B", "Boron");
        POLLUTANT_LOOKUP.put("C", "Carbon");
        POLLUTANT_LOOKUP.put("N", "Nitrogen");
        POLLUTANT_LOOKUP.put("O", "Oxygen");
        POLLUTANT_LOOKUP.put("F", "Fluorine");
        POLLUTANT_LOOKUP.put("Ne", "Neon");
        POLLUTANT_LOOKUP.put("Na", "Sodium");
        POLLUTANT_LOOKUP.put("Mg", "Magnesium");
        POLLUTANT_LOOKUP.put("Al", "Aluminum");
        POLLUTANT_LOOKUP.put("Si", "Silicon");
        POLLUTANT_LOOKUP.put("P", "Phosphorus");
        POLLUTANT_LOOKUP.put("S", "Sulfur");
        POLLUTANT_LOOKUP.put("Cl", "Chlorine");
        POLLUTANT_LOOKUP.put("Ar", "Argon");
        POLLUTANT_LOOKUP.put("K", "Potassium");
        POLLUTANT_LOOKUP.put("Ca", "Calcium");
        POLLUTANT_LOOKUP.put("Sc", "Scandium");
        POLLUTANT_LOOKUP.put("Ti", "Titanium");
        POLLUTANT_LOOKUP.put("V", "Vanadium");
        POLLUTANT_LOOKUP.put("Cr", "Chromium");
        POLLUTANT_LOOKUP.put("Mn", "Manganese");
        POLLUTANT_LOOKUP.put("Fe", "Iron");
        POLLUTANT_LOOKUP.put("Co", "Cobalt");
        POLLUTANT_LOOKUP.put("Ni", "Nickel");
        POLLUTANT_LOOKUP.put("Cu", "Copper");
        POLLUTANT_LOOKUP.put("Zn", "Zinc");
        POLLUTANT_LOOKUP.put("Ga", "Gallium");
        POLLUTANT_LOOKUP.put("Ge", "Germanium");
        POLLUTANT_LOOKUP.put("As", "Arsenic");
        POLLUTANT_LOOKUP.put("Se", "Selenium");
        POLLUTANT_LOOKUP.put("Br", "Bromine");
        POLLUTANT_LOOKUP.put("Kr", "Krypton");
        POLLUTANT_LOOKUP.put("Rb", "Rubidium");
        POLLUTANT_LOOKUP.put("Sr", "Strontium");
        POLLUTANT_LOOKUP.put("Y", "Yttrium");
        POLLUTANT_LOOKUP.put("Zr", "Zirconium");
        POLLUTANT_LOOKUP.put("Nb", "Niobium");
        POLLUTANT_LOOKUP.put("Mo", "Molybdenum");
        POLLUTANT_LOOKUP.put("Tc", "Technetium");
        POLLUTANT_LOOKUP.put("Ru", "Ruthenium");
        POLLUTANT_LOOKUP.put("Rh", "Rhodium");
        POLLUTANT_LOOKUP.put("Pd", "Palladium");
        POLLUTANT_LOOKUP.put("Ag", "Silver");
        POLLUTANT_LOOKUP.put("Cd", "Cadmium");
        POLLUTANT_LOOKUP.put("In", "Indium");
        POLLUTANT_LOOKUP.put("Sn", "Tin");
        POLLUTANT_LOOKUP.put("Sb", "Antimony");
        POLLUTANT_LOOKUP.put("Te", "Tellurium");
        POLLUTANT_LOOKUP.put("I", "Iodine");
        POLLUTANT_LOOKUP.put("Xe", "Xenon");
        POLLUTANT_LOOKUP.put("Cs", "Cesium");
        POLLUTANT_LOOKUP.put("Ba", "Barium");
        POLLUTANT_LOOKUP.put("La", "Lanthanum");
        POLLUTANT_LOOKUP.put("Ce", "Cerium");
        POLLUTANT_LOOKUP.put("Pr", "Praseodymium");
        POLLUTANT_LOOKUP.put("Nd", "Neodymium");
        POLLUTANT_LOOKUP.put("Pm", "Promethium");
        POLLUTANT_LOOKUP.put("Sm", "Samarium");
        POLLUTANT_LOOKUP.put("Eu", "Europium");
        POLLUTANT_LOOKUP.put("Gd", "Gadolinium");
        POLLUTANT_LOOKUP.put("Tb", "Terbium");
        POLLUTANT_LOOKUP.put("Dy", "Dysprosium");
        POLLUTANT_LOOKUP.put("Ho", "Holmium");
        POLLUTANT_LOOKUP.put("Er", "Erbium");
        POLLUTANT_LOOKUP.put("Tm", "Thulium");
        POLLUTANT_LOOKUP.put("Yb", "Ytterbium");
        POLLUTANT_LOOKUP.put("Lu", "Lutetium");
        POLLUTANT_LOOKUP.put("Hf", "Hafnium");
        POLLUTANT_LOOKUP.put("Ta", "Tantalum");
        POLLUTANT_LOOKUP.put("W", "Tungsten");
        POLLUTANT_LOOKUP.put("Re", "Rhenium");
        POLLUTANT_LOOKUP.put("Os", "Osmium");
        POLLUTANT_LOOKUP.put("Ir", "Iridium");
        POLLUTANT_LOOKUP.put("Pt", "Platinum");
        POLLUTANT_LOOKUP.put("Au", "Gold");
        POLLUTANT_LOOKUP.put("Hg", "Mercury");
        POLLUTANT_LOOKUP.put("Tl", "Thallium");
        POLLUTANT_LOOKUP.put("Pb", "Lead");
        POLLUTANT_LOOKUP.put("Bi", "Bismuth");
        POLLUTANT_LOOKUP.put("Po", "Polonium");
        POLLUTANT_LOOKUP.put("At", "Astatine");
        POLLUTANT_LOOKUP.put("Rn", "Radon");
        POLLUTANT_LOOKUP.put("Fr", "Francium");
        POLLUTANT_LOOKUP.put("Ra", "Radium");
        POLLUTANT_LOOKUP.put("Ac", "Actinium");
        POLLUTANT_LOOKUP.put("Th", "Thorium");
        POLLUTANT_LOOKUP.put("Pa", "Protactinium");
        POLLUTANT_LOOKUP.put("U", "Uranium");
        POLLUTANT_LOOKUP.put("Np", "Neptunium");
        POLLUTANT_LOOKUP.put("Pu", "Plutonium");
        POLLUTANT_LOOKUP.put("Am", "Americium");
        POLLUTANT_LOOKUP.put("Cm", "Curium");
        POLLUTANT_LOOKUP.put("Bk", "Berkelium");
        POLLUTANT_LOOKUP.put("Cf", "Californium");
        POLLUTANT_LOOKUP.put("Es", "Einsteinium");
        POLLUTANT_LOOKUP.put("Fm", "Fermium");
        POLLUTANT_LOOKUP.put("Md", "Mendelevium");
        POLLUTANT_LOOKUP.put("No", "Nobelium");
        POLLUTANT_LOOKUP.put("Lr", "Lawrencium");
        POLLUTANT_LOOKUP.put("Rf", "Rutherfordium");
        POLLUTANT_LOOKUP.put("Db", "Dubnium");
        POLLUTANT_LOOKUP.put("Sg", "Seaborgium");
        POLLUTANT_LOOKUP.put("Bh", "Bohrium");
        POLLUTANT_LOOKUP.put("Hs", "Hassium");
        POLLUTANT_LOOKUP.put("Mt", "Meitnerium");
        POLLUTANT_LOOKUP.put("Ds", "Darmstadtium");
        POLLUTANT_LOOKUP.put("Rg", "Roentgenium");
        POLLUTANT_LOOKUP.put("Cn", "Copernicium");
        POLLUTANT_LOOKUP.put("Nh", "Nihonium");
        POLLUTANT_LOOKUP.put("Fl", "Flerovium");
        POLLUTANT_LOOKUP.put("Mc", "Moscovium");
        POLLUTANT_LOOKUP.put("Lv", "Livermorium");
        POLLUTANT_LOOKUP.put("Ts", "Tennessine");
        POLLUTANT_LOOKUP.put("Og", "Oganesson");

        // Casing and spelling variants -> one canonical spelling.
        POLLUTANT_LOOKUP.put("AMMONIUM", "Ammonium");
        POLLUTANT_LOOKUP.put("AMMONIA", "Ammonia");
        POLLUTANT_LOOKUP.put("SULPHATE", "Sulphate");
        POLLUTANT_LOOKUP.put("NITRIC ACID", "Nitric Acid");
        POLLUTANT_LOOKUP.put("NITRATE", "Nitrate");
        POLLUTANT_LOOKUP.put("Chrysene&Triphenylene", "Chrysene & Triphenylene");
        POLLUTANT_LOOKUP.put("Chrysene & Triphenylene (C&T)", "Chrysene & Triphenylene");
        POLLUTANT_LOOKUP.put("b-Pinene", "B-Pinene");
    }

    /** Not instantiable: this class only hosts the static lookup. */
    private PollutantMapping() {
    }

    /**
     * Resolves a raw pollutant name to its canonical name.
     *
     * @param pollutantName the raw name to resolve; matched exactly (case-sensitive,
     *                      no trimming). May be {@code null}.
     * @return the mapped name when the table has an entry for {@code pollutantName};
     *         otherwise {@code pollutantName} itself, including {@code null} when
     *         {@code null} was passed in
     */
    public static String lookupPollutantName(String pollutantName) {
        // ConcurrentHashMap rejects null keys with a NullPointerException, so treat
        // null like any other unmapped name and hand it back unchanged.
        if (pollutantName == null) {
            return null;
        }
        return POLLUTANT_LOOKUP.getOrDefault(pollutantName, pollutantName);
    }
}
14 changes: 8 additions & 6 deletions src/main/java/com/dbf/naps/data/loader/FileLoadRunner.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.dbf.naps.data.globals.PollutantMapping;
import com.dbf.naps.data.utilities.DataCleaner;

public abstract class FileLoadRunner implements Runnable {
Expand Down Expand Up @@ -99,20 +100,21 @@ protected Integer getSiteID(String napsID, long recordNumber) {
}
}

protected Integer getPollutantID(String compound, String method) {
String lookupKey = compound + "_" + method;
protected Integer getPollutantID(String rawPollutantName, String method) {
String lookupKey = rawPollutantName + "_" + method;

//If one thread stamps overrides the data of another it's no big deal
return pollutantIDLookup.computeIfAbsent(lookupKey, k -> {
return pollutantIDLookup.computeIfAbsent(lookupKey, pollutantName -> {
Integer pollutantID = null;
pollutantName = PollutantMapping.lookupPollutantName(pollutantName);
//May or may not insert, let the DB manage contention
try(SqlSession session = sqlSessionFactory.openSession(true)) {
DataMapper mapper = session.getMapper(DataMapper.class);
mapper.insertPollutant(compound, method);
pollutantID = mapper.getPollutantID(compound, method);
mapper.insertPollutant(pollutantName, method);
pollutantID = mapper.getPollutantID(pollutantName, method);
}
if(null == pollutantID) {
throw new IllegalArgumentException("Could not find matching Pollutant ID for compound " + compound + ", and method " + method);
throw new IllegalArgumentException("Could not find matching Pollutant ID for compound " + rawPollutantName + ", and method " + method);
}
return pollutantID;
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public class IntegratedFileLoadRunner extends FileLoadRunner {
DEFAULT_IGNORED_HEADERS.add("MASS"); //Sample Mass
DEFAULT_IGNORED_HEADERS.add("SURROGATE"); //Surrogate Recovery
DEFAULT_IGNORED_HEADERS.add("48 H"); //Not sure why this is a column
DEFAULT_IGNORED_HEADERS.add("48-H"); //Same as 48 H
DEFAULT_IGNORED_HEADERS.add("CANISTER"); //Canister ID#
DEFAULT_IGNORED_HEADERS.add("CART"); //Cart, Cartridge
DEFAULT_IGNORED_HEADERS.add("START"); //Start Time
Expand All @@ -70,7 +71,7 @@ public class IntegratedFileLoadRunner extends FileLoadRunner {
DEFAULT_IGNORED_HEADERS.add("WD"); //WD
DEFAULT_IGNORED_HEADERS.add("-VFLAG"); //Validation Flag
DEFAULT_IGNORED_HEADERS.add("VOLUME"); //Actual Volume

DEFAULT_IGNORED_HEADERS.add("SITE"); //Site Type
DEFAULT_IGNORED_SHEETS.add("CHANGELOG");
DEFAULT_IGNORED_SHEETS.add("STATION");
DEFAULT_IGNORED_SHEETS.add("METADATA");
Expand Down Expand Up @@ -142,7 +143,7 @@ protected void processSheetFile(ExcelSheet sheet) throws Exception {

//Sanity check. The last column may not be the NAPS ID. We need to confirm it.
for(int col = sheet.columnCount()-1; col >= 0; col--) {
String columnHeader = sheet.getCellContents(col, headerRowNumber).toUpperCase();
String columnHeader = sheet.getCellContents(col, headerRowNumber).trim().toUpperCase();
if (columnHeader.equals("NAPS ID") || columnHeader.equals("NAPS SITE ID")) {
siteIDColumn = col;
break;
Expand Down Expand Up @@ -250,7 +251,7 @@ protected List<IntegratedDataRecord> processRow(Date date) {
//Data is expected to start on column 2
//Last column is NAPS ID and is ignored
for (int col = 1; col < getLastColumn(); col++) {
String columnHeader = getSheet().getCellContents(col, getHeaderRowNumber());
String columnHeader = getSheet().getCellContents(col, getHeaderRowNumber()).trim();
if(isColumnIgnored(columnHeader)) continue;

IntegratedDataRecord record = processSingleRecord(columnHeader, getSheet().getCellContents(col, row), date);
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/com/dbf/naps/data/utilities/DataCleaner.java
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ private static double convertDurationToHours(String duration) {
private static final Pattern COLUMN_ABBREVIATION_PATTERN = Pattern.compile(" \\([A-Za-z0-9]+\\)"); //" (BkFLT)" in "Benzo(k)Fluoranthene (BkFLT)"
private static final Map<String, String> COLUMN_ABBREVIATION_CACHE = new ConcurrentHashMap<String, String>();
public static String replaceColumnHeaderAbbreviation(String rawColumnHeader) {
rawColumnHeader = rawColumnHeader.trim();
return COLUMN_ABBREVIATION_CACHE.computeIfAbsent(rawColumnHeader, columnHeader -> {
Matcher matcher = COLUMN_ABBREVIATION_PATTERN.matcher(columnHeader);

Expand All @@ -139,6 +140,7 @@ public static String replaceColumnHeaderAbbreviation(String rawColumnHeader) {

private static final Map<String, String> COLUMN_UNITS_CACHE = new ConcurrentHashMap<String, String>();
public static String replaceColumnHeaderUnits(String rawColumnHeader) {
rawColumnHeader = rawColumnHeader.trim();
return COLUMN_UNITS_CACHE.computeIfAbsent(rawColumnHeader, columnHeader -> {
return columnHeader.replace(" (ug/m3)", "").replace(" ug/m3", ""); //Sometimes we have brackets, sometimes we don't ¯\_(ツ)_/¯
});
Expand Down

0 comments on commit 4065c2d

Please sign in to comment.