Commit

header data and new scripts
Headers for the Irish data and for the others (the German, Italian, and
Russian passenger lists share the same header).

Scripts for collecting the Irish famine passenger list data.
dandawg committed Mar 11, 2016
1 parent 3b28fe8 commit 867eee6
Showing 5 changed files with 44,056 additions and 0 deletions.
@@ -0,0 +1,29 @@

#===============================================================================
# Irish Famine Header Data
# (highest manifest ID value is 5730,
#  total observations < 4000,
#  passenger total for each ship is less than 1000)
#===============================================================================

MANIFEST IDENTIFER NUMBER  SHIP NAME  CODE FOR SHIP  PORT OF EMBARKATION  SHIP ARRIVAL DATE  NUMBER OF CORRESPONDING PASSENGERS
5730                       ONWARD                    CORK & LIVERPOOL     12/11/1851         357
5729                       LEVANT                    DUBLIN               12/09/1851         205
5728                       FLORIDA                   LIVERPOOL            11/28/1851         371


LINK TO DOWNLOAD
(split below where the start and end values fill the %d placeholders)
#===============================================================================
https://aad.archives.gov/aad/download-results?ft=R&dt=1613&sc=22506%2C22508%2C22509%2C22511%2C22513&cat=GP44&tf=F&bc=%2Csl%2Cfd&q=&as_alq=&as_anq=&as_epq=&as_woq=&nfo_22506=N%2C8%2C1900&op_22506=8&

txt_22506=%d&
txt_22506=%d&

nfo_22508=V%2C21%2C1900&op_22508=0&txt_22508=&nfo_22509=V%2C3%2C1900&cl_22509=&nfo_22511=D%2C10%2C1846&op_22511=3&txt_22511=&txt_22511=&nfo_22513=N%2C3%2C1900&op_22513=6&txt_22513=&txt_22513=&mtch=222&dl=783


#===============================================================================
# Irish Famine Passenger Data
#===============================================================================
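
For reference, a minimal sketch (in the same Python 2 style as the scripts below) of how the download link above is assembled: the two txt_22506=%d placeholders take the start and end manifest IDs of one request window, since each AAD download returns at most 1000 records. The segment strings here are truncated stand-ins, so the printed URL is illustrative rather than a working link.

prefix = 'https://aad.archives.gov/aad/download-results?ft=R&dt=1613&'  #first URL segment (truncated)
lo = 'txt_22506=%d&'        #filled with the start manifest ID
hi = 'txt_22506=%d&'        #filled with the end manifest ID
suffix = 'mtch=222&dl=783'  #final URL segment (truncated)
request_url = prefix + lo % 0 + hi % 999 + suffix  #one 1000-row window
print request_url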

107 changes: 107 additions & 0 deletions Get_Irish_Famine_Passenger_Data/irish_famine_header.py
@@ -0,0 +1,107 @@
# -*- coding: utf-8 -*-
"""
#===============================================================================
# Irish Famine Header Data
#(highest manifest ID value is 5730,
# total observations < 4000,
# passenger total for each ship is less than 1000)
#===============================================================================
MANIFEST IDENTIFER NUMBER  SHIP NAME  CODE FOR SHIP  PORT OF EMBARKATION  SHIP ARRIVAL DATE  NUMBER OF CORRESPONDING PASSENGERS
5730                       ONWARD                    CORK & LIVERPOOL     12/11/1851         357
5729                       LEVANT                    DUBLIN               12/09/1851         205
5728                       FLORIDA                   LIVERPOOL            11/28/1851         371
LINK TO DOWNLOAD
(split below where the start and end values fill the %d placeholders)
#===============================================================================
https://aad.archives.gov/aad/download-results?ft=R&dt=1613&sc=22506%2C22508%2C22509%2C22511%2C22513&cat=GP44&tf=F&bc=%2Csl%2Cfd&q=&as_alq=&as_anq=&as_epq=&as_woq=&nfo_22506=N%2C8%2C1900&op_22506=8&
txt_22506=%d&
txt_22506=%d&
nfo_22508=V%2C21%2C1900&op_22508=0&txt_22508=&nfo_22509=V%2C3%2C1900&cl_22509=&nfo_22511=D%2C10%2C1846&op_22511=3&txt_22511=&txt_22511=&nfo_22513=N%2C3%2C1900&op_22513=6&txt_22513=&txt_22513=&mtch=222&dl=783
Created on Wed Mar 09 18:25:28 2016
@author: Daniel
"""

import pandas as pd
import urllib2
import os
import time

OUT_DIR = r'E:\w209_Final_Project_Immigration\Data\Irish_Famine_Passenger_Data'

#==============================================================================
# URL setup
#==============================================================================

url1 = 'https://aad.archives.gov/aad/download-results?ft=R&dt=1613&'+\
       'sc=22506%2C22508%2C22509%2C22511%2C22513&cat=GP44&tf=F&bc=%2Csl%2Cfd&'+\
       'q=&as_alq=&as_anq=&as_epq=&as_woq=&nfo_22506=N%2C8%2C1900&op_22506=8&'

url2 = 'txt_22506=%d&'  #filled with the start manifest ID
url3 = 'txt_22506=%d&'  #filled with the end manifest ID

url4 = 'nfo_22508=V%2C21%2C1900&op_22508=0&txt_22508=&nfo_22509=V%2C3%2C1900&'+\
       'cl_22509=&nfo_22511=D%2C10%2C1846&op_22511=3&txt_22511=&txt_22511=&'+\
       'nfo_22513=N%2C3%2C1900&op_22513=6&txt_22513=&txt_22513=&mtch=222&dl=783'

#==============================================================================
# Get Data
#==============================================================================
#download up to 1000 data points for manifest IDs in [start, end]
def get_data(start, end):
    url = url1 + url2 % start + url3 % end + url4
    n = 1
    while n < 6:
        try:
            output = pd.read_csv(url, header=0)
            return output
        except urllib2.URLError:
            print 'Download for data with start=%d and end=%d failed' %(start, end)
            print 'Retry #%d' %n
            n += 1
            time.sleep(2)
    #give up after five failed attempts
    raise urllib2.URLError('download failed for start=%d, end=%d' %(start, end))

#append data to csv file
def append_data(data, file_name, path=OUT_DIR):
    '''append data in csv format to file_name file'''
    full_path = os.path.join(path, file_name)
    if os.path.exists(full_path):
        with open(full_path, 'a') as f:
            data.to_csv(f, header=False, index=False)
    else:
        with open(full_path, 'a') as f:
            data.to_csv(f, header=True, index=False)

#build csv file
def mkDataset_1(start, end, file_name, path=OUT_DIR):
    print 'Starting data acquisition'
    st = start
    itrs = []
    tm1 = time.time()
    #split the ID range into 1000-wide (start, end) windows
    while st < end:
        nd = st + 999
        itrs.append((st, nd))
        st = nd + 1
    for i in xrange(len(itrs)):
        #pause briefly between requests
        time.sleep(.01)
        #report progress at most every 9 seconds
        tm2 = time.time()
        if tm2 - tm1 > 9:
            print 'Getting range: %s \ttime: %s' %(str(itrs[i]), time.ctime())
            tm1 = tm2
        #read data from url
        data = get_data(itrs[i][0], itrs[i][1])
        if len(data) == 0:
            continue
        append_data(data, file_name, path) #write data to file

if __name__ == '__main__':
    # mkDataset(1795701,99999999,'russian_manifest_headers.csv')
    mkDataset_1(0, 6000, 'irish_famine_manifest_headers.csv')
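
To make the chunking in mkDataset_1 easier to verify on its own, here is a small standalone sketch (the helper name windows is hypothetical) that reproduces the 1000-wide (start, end) pairs collected in itrs:

def windows(start, end, width=1000):
    #split [start, end) into width-sized (st, nd) pairs, as mkDataset_1 does
    out = []
    while start < end:
        out.append((start, start + width - 1))
        start += width
    return out

print windows(0, 6000)
#[(0, 999), (1000, 1999), (2000, 2999), (3000, 3999), (4000, 4999), (5000, 5999)]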
101 changes: 101 additions & 0 deletions Get_Irish_Famine_Passenger_Data/irish_famine_passenger_list.py
@@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 09 18:56:10 2016
@author: Daniel
"""

import pandas as pd
import urllib2
import os
import time

OUT_DIR = r'E:\w209_Final_Project_Immigration\Data\Irish_Famine_Passenger_Data'
EXCEPTION_FILE = os.path.join(OUT_DIR,'irish_exceptions_log.csv')
MANIFEST_FILE = os.path.join(OUT_DIR,'irish_famine_manifest_headers.csv')

#==============================================================================
# URL setup
#==============================================================================

url1 = 'https://aad.archives.gov/aad/download-results?ft=R&dt=180&'+\
       'sc=17169%2C17170%2C17172%2C17189%2C17177%2C17180%2C17190%2C17181&'+\
       'cat=GP44&tf=F&bc=%2Csl%2Cfd&q=&as_alq=&as_anq=&as_epq=&as_woq=&'+\
       'nfo_17169=V%2C20%2C1900&op_17169=0&txt_17169=&nfo_17170=V%2C19%2C1900&'+\
       'op_17170=0&txt_17170=&nfo_17172=N%2C3%2C1900&cl_17172=&'+\
       'nfo_17189=N%2C3%2C1900&cl_17189=&nfo_17177=V%2C20%2C1900&op_17177=0&'+\
       'txt_17177=&nfo_17180=N%2C3%2C1900&cl_17180=&nfo_17190=N%2C8%2C1900&'

url2 = 'cl_17190=%d&'  #filled with the manifest ID

url3 = 'nfo_17181=D%2C10%2C1846&op_17181=3&txt_17181=&txt_17181=&mtch=357&dl=412'

#==============================================================================
# Get Data
#==============================================================================
#download up to 1000 passenger records for one manifest
def get_data(manifest_id):
    url = url1 + url2 % manifest_id + url3
    n = 1
    while n < 6:
        try:
            output = pd.read_csv(url, header=0)
            return output
        except urllib2.URLError:
            print 'Download for manifest_id=%d failed' % manifest_id
            print 'Retry #%d' % n
            n += 1
            time.sleep(2)
    #give up after five failed attempts
    raise urllib2.URLError('download failed for manifest_id=%d' % manifest_id)

#append data to csv file
def append_data(data, file_name, path=OUT_DIR):
    '''append data in csv format to file_name file'''
    full_path = os.path.join(path, file_name)
    if os.path.exists(full_path):
        with open(full_path, 'a') as f:
            data.to_csv(f, header=False, index=False)
    else:
        with open(full_path, 'a') as f:
            data.to_csv(f, header=True, index=False)

#build csv file
def mkDataset(num,
              file_name,
              manifest_file=MANIFEST_FILE,
              path=OUT_DIR,
              exception_file=EXCEPTION_FILE):
    '''get data starting at manifest number 'num' '''
    print 'Starting data acquisition'
    tm1 = time.time()
    recs = 0
    #get manifest data, sorted by manifest ID
    mnfst = pd.read_csv(manifest_file, header=0
                        ).sort(columns=["MANIFEST IDENTIFER NUMBER"])
    mnums = mnfst["MANIFEST IDENTIFER NUMBER"]
    mnums = mnums[mnums >= num]
    #iterate over manifest numbers
    for i in mnums:
        #pause briefly between requests
        time.sleep(.002)
        #report progress at most every 9 seconds
        tm2 = time.time()
        if tm2 - tm1 > 9:
            print 'manifest: %d \trecords: %d\ttime: %s' %(i,
                                                           recs,
                                                           time.ctime())
            tm1 = tm2
        #download and append data
        data = get_data(i)
        if len(data) == 0:
            continue
        if len(data) > 999:
            #data is probably truncated by the 1000-row download cap;
            #log the manifest ID in the exception file and skip it
            with open(exception_file, 'a') as ef:
                ef.write(str(i) + '\n')
            continue
        append_data(data, file_name, path)
        recs += len(data)

if __name__ == '__main__':
    mkDataset(3899, 'irish_famine_passenger_data.csv')
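
After a run, any manifests skipped for exceeding the 1000-row cap sit in the exception log, one manifest ID per line. A minimal sketch for reviewing it, assuming the EXCEPTION_FILE path above exists:

with open(EXCEPTION_FILE) as ef:
    #each line is the ID of a manifest whose download hit the 1000-row cap
    skipped = [int(line) for line in ef if line.strip()]
print '%d manifests may need follow-up: %s' % (len(skipped), skipped)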
