Skip to content

Commit

Permalink
Various file parser fixes for the integrated datasets.
Browse files Browse the repository at this point in the history
  • Loading branch information
dbeaudoinfortin committed Jun 10, 2024
1 parent b97ee3f commit 73df4e0
Show file tree
Hide file tree
Showing 13 changed files with 123 additions and 44 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ You can invoke this tool by running the class com.dbf.naps.data.download.integra

## NAPSIntegratedDataLoader

A Java tool that loads all of the raw integrated data (downloaded by the NAPSIntegratedDataDownloader) from the provided directory into a PostgreSQL database, as specified. The database schema is automatically created when the tool runs. This tool automatically cleans-up and fixes data inconsistencies as it finds them. Once all the data is loaded, there should be about ____ million rows of data (as of May 2024) in the integrated_data table of your database.
A Java tool that loads all of the raw integrated data (downloaded by the NAPSIntegratedDataDownloader) from the provided directory into a PostgreSQL database, as specified. The database schema is automatically created when the tool runs. This tool automatically cleans up and fixes data inconsistencies as it finds them. Once all the data is loaded, there should be about 65 million rows of data (as of May 2024) in the integrated_data table of your database.

You can invoke this tool by running the class com.dbf.naps.data.loader.integrated.NAPSIntegratedDataLoader.

Expand Down
1 change: 1 addition & 0 deletions src/main/java/com/dbf/excel/BIFF8ExcelSheet.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ public class BIFF8ExcelSheet extends BaseExcelSheet {
private Sheet sheet;

/**
 * Creates a sheet wrapper around the supplied sheet object.
 * The superclass constructor is invoked explicitly before the reference is stored.
 *
 * @param sheet the underlying sheet, kept in the {@code sheet} field
 */
public BIFF8ExcelSheet(Sheet sheet) {
super();
this.sheet = sheet;
}

Expand Down
11 changes: 8 additions & 3 deletions src/main/java/com/dbf/excel/BaseExcelSheet.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,16 @@
public abstract class BaseExcelSheet implements ExcelSheet {

//Note: SimpleDateFormat is not thread safe, must not be static
protected final SimpleDateFormat TYPICAL_DATE_FORMAT = new SimpleDateFormat("MM-dd-yy");
protected final SimpleDateFormat TYPICAL_DATE_FORMAT = new SimpleDateFormat("MM-dd-yy", Locale.ENGLISH);
protected final SimpleDateFormat NEWER_DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd", Locale.ENGLISH); //2010 and beyond
protected final SimpleDateFormat BAD_DATE_FORMAT = new SimpleDateFormat("dd-MMM-yy", Locale.ENGLISH);


public BaseExcelSheet() {
TYPICAL_DATE_FORMAT.setLenient(false);
NEWER_DATE_FORMAT.setLenient(false);
BAD_DATE_FORMAT.setLenient(false);
}

//Work-around for all the strange date formats across several excel versions
protected Date extractRawDate(String rawDate) {
try {
Expand All @@ -39,5 +45,4 @@ protected Date extractRawDate(String rawDate) {
}
}
}

}
44 changes: 22 additions & 22 deletions src/main/java/com/dbf/excel/ExcelSheetFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ public class ExcelSheetFactory {

private static final Logger log = LoggerFactory.getLogger(ExcelSheetFactory.class);

public static List<ExcelSheet> getSheets(File excelFile, List<String> matchingSheetNames) throws IOException {
if(excelFile.getName().toUpperCase().endsWith(".XLS")) return createXLSSheet(excelFile, matchingSheetNames);
public static List<ExcelSheet> getSheets(File excelFile, List<String> matchingSheetNames, List<String> excludedSheetNames) throws IOException {
if(excelFile.getName().toUpperCase().endsWith(".XLS")) return createXLSSheet(excelFile, matchingSheetNames, excludedSheetNames);

//Handle XLSX sheet
log.info("Trying to load XLSX Excel workbook " + excelFile + " into memory.");
Expand All @@ -34,19 +34,20 @@ public static List<ExcelSheet> getSheets(File excelFile, List<String> matchingSh

//We need to find the sheet that we want using a case-insensitive partial match approach
List<ExcelSheet> matchingSheets = new ArrayList<ExcelSheet>();
List<String> matchingSheetNamesUpper = getMatchingSheetNames(matchingSheetNames);
for(int i = 0; i < workbook.getNumberOfSheets(); i++ ) {
if (sheetNameMatches(workbook.getSheetName(i), matchingSheetNamesUpper))
if (sheetNameMatches(workbook.getSheetName(i), matchingSheetNames, excludedSheetNames))
matchingSheets.add(new XLSXExcelSheet(workbook.getSheetAt(i)));
}

if (matchingSheets.isEmpty())
throw new IllegalArgumentException("Could not locate a matching sheet inside of the workbook using sheet names: " + matchingSheetNames);
if (matchingSheets.isEmpty()) {
log.warn("Could not locate a matching sheet inside of the workbook using sheet names " + matchingSheetNames + ". Falling back to the first sheet.");
matchingSheets.add(new XLSXExcelSheet(workbook.getSheetAt(0)));
}
return matchingSheets;
}
}

private static List<ExcelSheet> createXLSSheet(File excelFile, List<String> matchingSheetNames) throws IOException {
private static List<ExcelSheet> createXLSSheet(File excelFile, List<String> matchingSheetNames, List<String> excludedSheetNames) throws IOException {
//NOTE: the XLS data files are inconsistent with most seemingly in BIFF4 format and some in BIFF8 format.
//I suspect they were all initially generated in BIFF4 format and some were later corrected and re-uploaded in BIFF8.
//This is surprising since data up until 2009 is in BIFF4, which is a format from 1992.
Expand All @@ -63,14 +64,15 @@ private static List<ExcelSheet> createXLSSheet(File excelFile, List<String> matc

//We need to find the sheet that we want using a case-insensitive partial match approach
List<ExcelSheet> matchingSheets = new ArrayList<ExcelSheet>();
List<String> matchingSheetNamesUpper = getMatchingSheetNames(matchingSheetNames);
for(String sheetName : workbook.getSheetNames()) {
if (sheetNameMatches(sheetName, matchingSheetNamesUpper))
if (sheetNameMatches(sheetName, matchingSheetNames, excludedSheetNames))
matchingSheets.add(new BIFF8ExcelSheet(workbook.getSheet(sheetName)));
}
if (matchingSheets.isEmpty())
throw new IllegalArgumentException("Could not locate a matching sheet inside of the workbook using sheet names: " + matchingSheetNames);

if (matchingSheets.isEmpty()) {
log.warn("Could not locate a matching sheet inside of the workbook using sheet names " + matchingSheetNames + ". Falling back to the first sheet.");
matchingSheets.add(new BIFF8ExcelSheet(workbook.getSheet(0)));
}

//Note: the input stream is already closed, we don't really want to call workbook.close()
//since that will clear the underlying data
return matchingSheets;
Expand All @@ -81,20 +83,18 @@ private static List<ExcelSheet> createXLSSheet(File excelFile, List<String> matc
}
}

private static boolean sheetNameMatches(String sheetName, List<String> matchingSheetNames) {
private static boolean sheetNameMatches(String sheetName, List<String> matchingSheetNames, List<String> excludedSheetNames) {
String sheetNameUpper = sheetName.toUpperCase();

for (String excludedSheetName : excludedSheetNames) {
if (sheetNameUpper.startsWith(excludedSheetName)) return false;
}

for (String matchingSheetName : matchingSheetNames) {
if (sheetNameUpper.startsWith(matchingSheetName)) return true;
}
log.debug("Ignored sheet " + sheetName);

log.warn("Ignoring unexpected sheet: " + sheetName);
return false;
}

private static List<String> getMatchingSheetNames(List<String> matchingSheetNames) {
List<String> matchingSheetNamesUpper = new ArrayList<String>(matchingSheetNames.size());
for(String sheetName : matchingSheetNames) {
matchingSheetNamesUpper.add(sheetName.toUpperCase());
}
return matchingSheetNamesUpper;
}
}
1 change: 1 addition & 0 deletions src/main/java/com/dbf/excel/OldBIFFExcelSheet.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ public class OldBIFFExcelSheet extends RawDataExcelSheet {
private BOFRecord bof;

/**
 * Creates the sheet by invoking the superclass constructor and then handing the
 * file to {@code loadOldBIFFFile}.
 *
 * @param excelFile the Excel file to load
 * @throws IOException declared by this constructor; presumably raised by
 *         {@code loadOldBIFFFile} when the file cannot be read (confirm against that method)
 */
public OldBIFFExcelSheet(File excelFile) throws IOException {
super();
loadOldBIFFFile(excelFile);
}

Expand Down
4 changes: 4 additions & 0 deletions src/main/java/com/dbf/excel/RawDataExcelSheet.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ public abstract class RawDataExcelSheet extends BaseExcelSheet {

protected String sheetName;

/**
 * Default constructor. Does nothing beyond explicitly invoking the superclass constructor.
 */
public RawDataExcelSheet() {
super();
}

@Override
public int columnCount() {
return rawData.length;
Expand Down
25 changes: 22 additions & 3 deletions src/main/java/com/dbf/excel/XLSXExcelSheet.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package com.dbf.excel;

import java.util.Date;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
Expand All @@ -8,23 +10,32 @@
public class XLSXExcelSheet extends RawDataExcelSheet {

/**
 * Creates the sheet by invoking the superclass constructor and then copying the
 * contents of the supplied sheet via {@code loadXLSXFFile}.
 *
 * @param sheet the sheet whose name and cell contents are loaded
 * @throws IllegalArgumentException if {@code loadXLSXFFile} rejects the sheet
 *         because its row or column count exceeds the limits it enforces
 */
public XLSXExcelSheet(Sheet sheet) {
super();
loadXLSXFFile(sheet);
}

private void loadXLSXFFile(Sheet sheet) {
this.sheetName = sheet.getSheetName();
int rows = sheet.getPhysicalNumberOfRows();
int rows = sheet.getLastRowNum();

if (rows > 9000) {
throw new IllegalArgumentException("Sheet is too big. Row count of " + rows + " exceeds the maximum of 9000.");
}

// Determine the maximum number of columns we will need.
// Note that the number of columns varies per row
int cols = 0;
for (int r = 0; r < rows; r++) {
Row row = sheet.getRow(r);
if (row == null) continue;
cols = Math.max(cols, row.getPhysicalNumberOfCells());
cols = Math.max(cols, row.getLastCellNum());
}

// Refuse sheets wider than 500 columns. The message reports the same limit that the check enforces.
if (cols > 500) {
    throw new IllegalArgumentException("Sheet is too big. Column count of " + cols + " exceeds the maximum of 500.");
}

rawData = new String[cols][rows];
rawData = new String[cols][rows+1];

// Copy the raw cell contents into the rawData array
for (int r = 0; r < rows; r++) {
Expand Down Expand Up @@ -66,4 +77,12 @@ private String getCellValueAsString(Cell cell) {
return "Unsupported Cell Type";
}
}

/**
 * Reads the raw text stored for the given cell and converts it to a date.
 *
 * @param column index into the first dimension of {@code rawData}
 * @param row    index into the second dimension of {@code rawData}
 * @return the result of {@code extractRawDate} for the cell text, or {@code null}
 *         when the cell holds no text (null or the empty string)
 */
@Override
public Date getCellDate(int column, int row) {
    final String cellText = rawData[column][row];
    final boolean hasText = cellText != null && !cellText.isEmpty();
    return hasText ? extractRawDate(cellText) : null;
}
}
4 changes: 4 additions & 0 deletions src/main/java/com/dbf/naps/data/loader/NAPSDataLoader.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,16 @@ private void loadFiles() throws IOException {
List<Future<?>> futures = new ArrayList<Future<?>>();
final Path rawPath = getOptions().getDataPath();

log.info("Examining all files at the path " + rawPath);

if(!rawPath.toFile().isDirectory()) {
log.error("The path to the raw data is not valid: " + rawPath);
return;
}

recurseDir(rawPath, futures);
log.info("All files have been examined. " + futures.size() + " task(s) have been created. Waiting for completion...");

waitForTaskCompletion(futures);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,16 @@
public class NAPSIntegratedDataLoader extends NAPSDataLoader {

private static final List<IntegratedRunnerMapping> mappings = new ArrayList<IntegratedRunnerMapping>();
private static final List<Pattern> excludedPatterns = new ArrayList<Pattern>();
static {
mappings.add(new IntegratedRunnerMapping(CFFileLoadRunner.class, "DICHOT", "_DICH.XLS"));
mappings.add(new IntegratedRunnerMapping(CFFileLoadRunner.class, "PM2.5", "_PART25.XLS"));

mappings.add(new IntegratedRunnerMapping(SampleMetaDataFileLoadRunner.class, "PAH", "_PAH.XLS"));
mappings.add(new IntegratedRunnerMapping(SampleMetaDataFileLoadRunner.class, "HCB", "_HCB.XLS"));
mappings.add(new IntegratedRunnerMapping(SampleMetaDataFileLoadRunner.class, "VOC", "_VOC.XLS"));
mappings.add(new IntegratedRunnerMapping(SampleMetaDataFileLoadRunner.class, "VOC", "_VOCS.XLS")); //One file is mis-named :)
mappings.add(new IntegratedRunnerMapping(SampleMetaDataFileLoadRunner.class, "PCDD", "_PCDD.XLSX"));
mappings.add(new IntegratedRunnerMapping(SampleMetaDataFileLoadRunner.class, "PCDD", "_PCDD.XLS"));
mappings.add(new IntegratedRunnerMapping(SampleMetaDataFileLoadRunner.class, "PCB", "_PCB.XLS"));

Expand All @@ -44,12 +47,17 @@ public class NAPSIntegratedDataLoader extends NAPSDataLoader {

mappings.add(new IntegratedRunnerMapping(XLSXFileLoadRunner.class, "PAH", Pattern.compile("S[0-9]+_PAH_[0-9]{4}(_EN)?\\.XLSX"))); //Match S90121_PAH_2010.XLSX
mappings.add(new IntegratedRunnerMapping(XLSXFileLoadRunner.class, "PM2.5", Pattern.compile("S[0-9]+_PM25_[0-9]{4}(_EN)?\\.XLSX"))); //Match S40103_PM25_2010.XLSX
mappings.add(new IntegratedRunnerMapping(XLSXFileLoadRunner.class, "PM2.5-10", Pattern.compile("S[0-9]+_PM25_[0-9]{4}\\-10(_EN)?\\.XLSX"))); //Match S30113_PM25-10_2010.XLSX
mappings.add(new IntegratedRunnerMapping(XLSXFileLoadRunner.class, "PM2.5-10", Pattern.compile("S[0-9]+_PM25\\-10_[0-9]{4}(_EN)?\\.XLSX"))); //Match S30113_PM25-10_2010.XLSX

mappings.add(new IntegratedRunnerMapping(XLSXFileLoadRunner.class, "CARB", Pattern.compile("S[0-9]+_CARBONYLS_[0-9]{4}(_EN)?\\.XLSX"))); //Match S070119_CARBONYLS_2018_EN.XLSX
mappings.add(new IntegratedRunnerMapping(XLSXFileLoadRunner.class, "VOC", Pattern.compile("S[0-9]+_VOC_[0-9]{4}(_EN)?\\.XLSX"))); //Match S070119_VOC_2018_EN.XLSX

mappings.add(new IntegratedRunnerMapping(VOCFileLoadRunner.class, "VOC", Pattern.compile("S[0-9]+_VOC_[0-9]{4}(_EN)?\\.XLS"))); //Match S54401_VOC_2016_EN.XLS
mappings.add(new IntegratedRunnerMapping(VOCFileLoadRunner.class, "VOC", Pattern.compile("S[0-9]+(_24HR)?_VOC_[0-9]{4}(_EN)?\\.XLS"))); //Match S54401_VOC_2016_EN.XLS, S62601_24hr_VOC_2014.XLS
mappings.add(new IntegratedRunnerMapping(VOCFileLoadRunner.class, "VOC_4HR", Pattern.compile("S[0-9]+_4HR_VOC_[0-9]{4}(_EN)?\\.XLS"))); //S62601_4hr_VOC_2014.XLS
mappings.add(new IntegratedRunnerMapping(CarbonylsFileLoadRunner.class, "CARB", Pattern.compile("S[0-9]+_CARBONYLS_[0-9]{4}(_EN)?\\.XLS"))); //Match S54401_CARBONYLS_2016_EN.XLS

excludedPatterns.add(Pattern.compile("S[0-9]+_PM25_[0-9]{4}(_EN)?\\.XLS"));
excludedPatterns.add(Pattern.compile("S[0-9]+_PM25_[0-9]{4}(_EN)?\\.XLS"));
}

public NAPSIntegratedDataLoader(String[] args) {
Expand All @@ -71,6 +79,10 @@ protected Collection<Runnable> processFile(File dataFile) {

String fileName = dataFile.getName().toUpperCase();

//Ignore changelogs
if(fileName.startsWith("CHANGE") )
return Collections.emptyList();

//Ignore the CSV versions of the files, we will process the Excel sheets
if(fileName.endsWith(".CSV") )
return Collections.emptyList();
Expand All @@ -79,6 +91,12 @@ protected Collection<Runnable> processFile(File dataFile) {
if(fileName.endsWith("_FR.XLS") || fileName.endsWith("_FR.XLSX"))
return Collections.emptyList();

//These are files that we explicitly know we don't want to read
for(Pattern excludedPattern : excludedPatterns) {
if(excludedPattern.matcher(fileName).matches())
return Collections.emptyList();
}

try {
for(IntegratedRunnerMapping mapping : mappings) {
if((null != mapping.getFileNameMatch() && fileName.endsWith(mapping.getFileNameMatch()))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@ public class IntegratedFileLoadRunner extends FileLoadRunner {

private static final Logger log = LoggerFactory.getLogger(IntegratedFileLoadRunner.class);

//This are all of the known headers that are derived or represent metadata rather than raw data.
//These are all of the known headers that are derived or represent metadata rather than raw data.
private static final List<String> DEFAULT_IGNORED_HEADERS = new ArrayList<String>();

//These are all of the known sheets that can be safely ignored
private static final List<String> DEFAULT_IGNORED_SHEETS = new ArrayList<String>();
static {
DEFAULT_IGNORED_HEADERS.add("%"); //% Recovery
DEFAULT_IGNORED_HEADERS.add("RECOVERY"); //Recovery %, Recovery-AE, Recovery-PHE, etc.
Expand Down Expand Up @@ -68,6 +70,11 @@ public class IntegratedFileLoadRunner extends FileLoadRunner {
DEFAULT_IGNORED_HEADERS.add("WD"); //WD
DEFAULT_IGNORED_HEADERS.add("-VFLAG"); //Validation Flag
DEFAULT_IGNORED_HEADERS.add("VOLUME"); //Actual Volume

DEFAULT_IGNORED_SHEETS.add("CHANGELOG");
DEFAULT_IGNORED_SHEETS.add("STATION");
DEFAULT_IGNORED_SHEETS.add("METADATA");
DEFAULT_IGNORED_SHEETS.add("TSP");
}

//State held during processing
Expand All @@ -87,12 +94,12 @@ public IntegratedFileLoadRunner(int threadId, LoaderOptions config, SqlSessionFa
}

/**
* Main entry-point method for processing the sheet.
* Main entry-point method for processing the workbook.
*/
@Override
protected void processFile() throws Exception {
log.info(getThreadId() + ":: Starting to parse data from Excel workbook " + getRawFile() + ".");
List<ExcelSheet> sheets = ExcelSheetFactory.getSheets(getRawFile(), getMatchingSheetNames());
List<ExcelSheet> sheets = ExcelSheetFactory.getSheets(getRawFile(), getMatchingSheetNames(), getExcludedSheetNames());
log.info(getThreadId() + ":: Found " + sheets.size() + " matching sheet(s).");

for(ExcelSheet sheet : sheets) {
Expand All @@ -104,18 +111,24 @@ protected List<String> getMatchingSheetNames() {
return Collections.singletonList(fileType);
}

/**
 * Supplies the sheet names to exclude when the sheets of the workbook are looked up.
 * This implementation returns {@code DEFAULT_IGNORED_SHEETS}.
 * <p>
 * NOTE(review): the returned list is the shared static instance, not a copy, so
 * callers and overriding classes should treat it as read-only.
 *
 * @return the list of excluded sheet names
 */
protected List<String> getExcludedSheetNames() {
return DEFAULT_IGNORED_SHEETS;
}

/**
 * Sets the {@code method} field to {@code "INT_"} followed by the file type.
 * Invoked once per sheet, so a subclass can choose a different value for each sheet.
 */
protected void setMethod() {
this.method = "INT_" + fileType;
}

/**
* Processing of a single sheet of the workbook.
*/
protected void processSheetFile(ExcelSheet sheet) throws Exception {
this.sheet = sheet;

//Method may differ per sheet
setMethod();

log.info(getThreadId() + ":: Processing sheet " + sheet.getName() + ".");

log.info(getThreadId() + ":: Processing sheet" + (sheet.getName() == null ? "" : " " + sheet.getName()) + ".");
List<IntegratedDataRecord> records = new ArrayList<IntegratedDataRecord>(100);

//The first row is skipped. It has information in the form of:
Expand All @@ -135,7 +148,13 @@ protected void processSheetFile(ExcelSheet sheet) throws Exception {
break;
}
}
if(null == siteIDColumn) throw new IllegalArgumentException("Could not locate the NAPS ID column.");
if(null == siteIDColumn) {
//Some of the PM25 files are just quick summaries and don't contain full data. Add a quick sanity check.
if(fileType.equals("PM2.5") && sheet.getCellContents(0, 0).toUpperCase().startsWith("SAMPLING DATE")) {
return;
}
throw new IllegalArgumentException("Could not locate the NAPS ID column.");
}

preProcessRow();
//Done with header validation, ready to process the first row of data
Expand All @@ -149,7 +168,9 @@ protected void processSheetFile(ExcelSheet sheet) throws Exception {
//First column contains the date in the form of 11-20-84, unless the first column is being used as the NAPS Site ID
date = sheet.getCellDate(dateColumn, row);
} catch(IllegalArgumentException e) {
log.warn("Expected a date for column " + dateColumn + ", row " + row + ". Raw value is: " + sheet.getCellContents(0, row));
if(!sheet.getCellContents(dateColumn, row).startsWith("Sampling")) {
log.error("Expected a date for column " + dateColumn + ", row " + row + ". Raw value is: " + sheet.getCellContents(dateColumn, row));
}
continue; //This could be bad data or it could simply be a footer
}

Expand Down Expand Up @@ -191,7 +212,7 @@ protected void processSheetFile(ExcelSheet sheet) throws Exception {
*/
protected String[] getFirstColumnHeaders() {
//COMPOUND is sometimes "COMPOUND" and sometimes "COMPOUNDS"
return new String[] {"COMPOUND","DATE","CONGENER","SAMPLING", "NAPS SITE ID"};
return new String[] {"COMPOUND","DATE","CONGENER","SAMPLING", "NAPS SITE ID", "NAPS ID"};
}

/**
Expand Down
Loading

0 comments on commit 73df4e0

Please sign in to comment.