Scripts to download Russian Passenger Data

The header script downloads information on the ships. The second script uses the data downloaded by the first (the ship manifest IDs) to then download data on the passengers.
Showing 2 changed files with 259 additions and 0 deletions.
@@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
"""
Russians to America Passenger Data File, 1834 - 1897
#===============================================================================
Plan:
Use the collected manifest IDs (from csv, loaded into pandas).
If a query returns 1000 results, the dataset might be truncated.
In that case, recursively halve the range of IDs until a single ID is reached.
If one ID returns 1000 records, flag that ID in a file (we will need to get
the data for that ship another way). A sketch of this halving appears below,
after mkDataset.
Created on Sat March 05 20:52:17 2016
@author: Daniel
"""

import pandas as pd
import urllib2
import time, os

OUT_DIR = r'E:\w209_Final_Project_Immigration\Data'
EXCEPTION_FILE = os.path.join(OUT_DIR, 'russian_exceptions_log.csv')
MANIFEST_FILE = os.path.join(OUT_DIR, 'russian_manifest_headers.csv')

#==============================================================================
# URL setup
#==============================================================================
#the query matches a single manifest number; url2 takes that number
url1 = 'https://aad.archives.gov/aad/download-results?ft=R&'+\
    'dt=2126&sc=25924%2C25925%2C25926%2C25930%2C25932%2C25934&cat=GP44&'+\
    'tf=F&bc=%2Csl%2Cfd&q=&as_alq=&as_anq=&as_epq=&as_woq=&'+\
    'nfo_25924=V%2C50%2C1900&op_25924=0&txt_25924=&nfo_25925=V%2C50%2C1900&'+\
    'op_25925=0&txt_25925=&nfo_25926=V%2C10%2C1900&cl_25926=&'+\
    'nfo_25930=V%2C3%2C1900&cl_25930=&nfo_25932=V%2C20%2C1900&op_25932=0&'+\
    'txt_25932=&nfo_25934=N%2C8%2C1900&op_25934=3&'

url2 = 'txt_25934=%d&'

url3 = 'txt_25934=&mtch=483&dl=1327'

#The alternative below queries for records between two manifest numbers
#url1 = 'https://aad.archives.gov/aad/download-results?ft=R&dt=2126&'+\
#    'sc=25924%2C25925%2C25926%2C25930%2C25932%2C25934&cat=GP44&tf=F&'+\
#    'bc=%2Csl%2Cfd&q=&btnSearch=Search&as_alq=&as_anq=&as_epq=&as_woq=&'+\
#    'nfo_25924=V%2C50%2C1900&op_25924=0&txt_25924=&nfo_25925=V%2C50%2C1900&'+\
#    'op_25925=0&txt_25925=&nfo_25926=V%2C10%2C1900&cl_25926=&'+\
#    'nfo_25930=V%2C3%2C1900&cl_25930=&nfo_25932=V%2C20%2C1900&op_25932=0&'+\
#    'txt_25932=&nfo_25934=N%2C8%2C1900&op_25934=8&'
#
#url2 = 'txt_25934=%d&'
#url3 = 'txt_25934=%d&'
#
#url4 = 'mtch=258&dl=1327'

#==============================================================================
# Get Data
#==============================================================================
#download the records for one manifest number, retrying up to 5 times
def get_data(num):
    url = url1 + url2 %num + url3
    n = 1
    while n < 6:
        try:
            output = pd.read_csv(url, header=0)
            return output
        except urllib2.URLError:
            print 'Download for data with manifest number=%d failed' %(num)
            print 'Retry #%d' %n
            n += 1
            time.sleep(2)
    raise urllib2.URLError('download failed after 5 retries')

#append data to a csv file
def append_data(data, file_name, path=OUT_DIR):
    '''Append csv data to the csv file file_name'''
    full_path = os.path.join(path, file_name)
    if os.path.exists(full_path):
        with open(full_path, 'a') as f:
            data.to_csv(f, header=False, index=False)
    else:
        with open(full_path, 'a') as f:
            data.to_csv(f, header=True, index=False)

#build the csv file
def mkDataset(num,
              file_name,
              manifest_file=MANIFEST_FILE,
              path=OUT_DIR,
              exception_file=EXCEPTION_FILE):
    '''Get data starting at manifest number 'num' '''
    print 'Starting data acquisition'
    tm1 = time.time()
    recs = 0
    #get manifest data
    mnfst = pd.read_csv(manifest_file, header=0
                        ).sort(columns=["Manifest Identification Number"])
    mnums = mnfst["Manifest Identification Number"]
    mnums = mnums[mnums >= num]
    #iterate over manifest numbers
    for i in mnums:
        #pause
        time.sleep(.01)
        #print progress roughly every 9 seconds
        tm2 = time.time()
        if tm2 - tm1 > 9:
            print 'manifest: %d % 3.1f%%\trecords: %d\ttime: %s' %(i,
                (float(mnums[mnums == i].index[0])/len(mnums))*100,
                recs,
                time.ctime())
            tm1 = tm2
        #download and append data
        data = get_data(i)
        if len(data) == 0:
            continue
        if len(data) > 999:
            #data is probably truncated by the 1000-row download cap;
            #note the manifest number in the exception log
            with open(exception_file, 'a') as ef:
                ef.write(str(i) + '\n')
            continue
        append_data(data, file_name)
        recs += len(data)
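
#A minimal sketch (not called by this script) of the recursive halving
#described in the plan above. It assumes a hypothetical fetch(start, end)
#that returns a DataFrame for an inclusive range of manifest numbers
#(like the commented-out "between two manifest numbers" URL) and a
#hypothetical flag(i) that logs an ID whose records exceed the 1000-row cap.
def fetch_with_halving(start, end, fetch, flag):
    '''Recursively halve an ID range whose query hits the download cap.'''
    data = fetch(start, end)
    if len(data) < 1000:
        return data
    if start == end:
        flag(start)  #a single ID exceeds the cap; get that ship another way
        return data.iloc[0:0]
    mid = (start + end) // 2
    return pd.concat([fetch_with_halving(start, mid, fetch, flag),
                      fetch_with_halving(mid + 1, end, fetch, flag)])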

#==============================================================================
# execute
#==============================================================================

if __name__ == '__main__':
    mkDataset(43516, 'Russian_passenger_data.csv')
@@ -0,0 +1,127 @@
# -*- coding: utf-8 -*-
"""
Manifest Header Data File, 1834 - ca. 1900
(https://aad.archives.gov/aad/fielded-search.jsp?dt=2127&cat=GP44&tf=F&bc=,sl)
#===============================================================================
First 6 records
===============
Manifest Identification Number | Ship Name          | Port of Departure (Embarkation) | Date of Arrival
                             1 | MASSILIA           | MARSEILLES & NAPLES             | 04/04/1894
                             2 | LOUISA             | LIVERPOOL                       | 05/30/1848
                             4 | ITALIA             | NAPLES                          | 04/07/1894
                         24687 | POLAND             | HAVRE                           | 05/19/1834
                         24688 | ANNE & EMILIE      | BREMEN                          | 05/20/1834
                         24689 | TURBO              | HAVRE                           | 05/19/1834
NOTE: NOT ORDERED BY DATE
Last records
============
                        808257 | SAN GUGLIELMO      | NAPLES                          | 12/08/1912
                      99999999 | CITY OF WASHINGTON | LIVERPOOL                       | 12/29/1871
SHAW CATHERINE age 34 FEMALE UNKNOWN UNKNOWN GALICIA USA 903276
Plan:
=====
Get all records from the Manifest Header Data File by iterating over the
possible manifest numbers in increments of 1000 (see mkDataset below).
Created on Fri Mar 04 20:03:19 2016
@author: Daniel
"""

import pandas as pd
import urllib2
import os, time

OUT_DIR = r'E:\w209_Final_Project_Immigration\Data'

#==============================================================================
# URL setup
#==============================================================================

url1 = 'https://aad.archives.gov/aad/download-results?ft=R&dt=2127&'+\
    'sc=25490%2C25491%2C25492%2C25493&cat=GP44&tf=F&bc=%2Csl%2Cfd&q=&'+\
    'btnSearch=Search&as_alq=&as_anq=&as_epq=&as_woq=&nfo_25490=N%2C8%2C1900&'+\
    'op_25490=8&'

#url2 and url3 take the start and end of a manifest number range
url2 = 'txt_25490=%d&'
url3 = 'txt_25490=%d&'

url4 = 'nfo_25491=V%2C50%2C1900&op_25491=0&txt_25491=&nfo_25492=V%2C3%2C1900&'+\
    'cl_25492=&nfo_25493=D%2C18%2C1834&op_25493=8&txt_25493=&txt_25493=&'+\
    'mtch=399&dl=1277'

#==============================================================================
# Get Data
#==============================================================================
#download up to 1000 records for a manifest number range, retrying up to 5 times
def get_data(start, end):
    url = url1 + url2 %start + url3 %end + url4
    n = 1
    while n < 6:
        try:
            output = pd.read_csv(url, header=0)
            return output
        except urllib2.URLError:
            print 'Download for data with start=%d and end=%d failed' %(start, end)
            print 'Retry #%d' %n
            n += 1
            time.sleep(2)
    raise urllib2.URLError('download failed after 5 retries')

#append data to a csv file
def append_data(data, file_name, path=OUT_DIR):
    full_path = os.path.join(path, file_name)
    if os.path.exists(full_path):
        with open(full_path, 'a') as f:
            data.to_csv(f, header=False, index=False)
    else:
        with open(full_path, 'a') as f:
            data.to_csv(f, header=True, index=False)

#build the csv file
def mkDataset(start, end, file_name, path=OUT_DIR):
    '''Download manifest headers for start..end in chunks of <=1000 records'''
    print 'Starting data acquisition'
    st = start
    itrs = []
    tm1 = time.time()
    #build the chunk boundaries
    while st < end:
        itrs.append(st)
        st += 999
    itrs.append(end)
    for i in xrange(len(itrs)-1):
        #pause
        time.sleep(.05)
        #print progress roughly every 9 seconds
        tm2 = time.time()
        if tm2 - tm1 > 9:
            print 'progress: %d % 3.1f%%' %(itrs[i], (float(itrs[i])/end)*100)
            tm1 = tm2
        #read data from url
        data = get_data(itrs[i], itrs[i+1])
        if len(data) == 0:
            continue
        append_data(data, file_name, path)  #write data to file

#TODO: mkDataset returns a duplicate row at each iteration changeover,
#because consecutive chunks share an endpoint
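
#A possible fix for the TODO (a sketch, not wired into mkDataset): build
#inclusive chunk boundaries that never share an endpoint. mk_ranges is a
#hypothetical helper, not part of the original script.
def mk_ranges(start, end, width=999):
    '''Yield inclusive (lo, hi) ranges covering start..end without overlap.'''
    lo = start
    while lo <= end:
        hi = min(lo + width, end)
        yield (lo, hi)
        lo = hi + 1

#e.g. list(mk_ranges(1, 2500)) -> [(1, 1000), (1001, 2000), (2001, 2500)]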

def manMKdataset(start, end, file_name, path=OUT_DIR):
    '''Manually download a single chunk of data'''
    data = get_data(start, end)  #read data from url
    append_data(data, file_name, path)  #write data to file

#==============================================================================
# execute
#==============================================================================

if __name__ == '__main__':
#    mkDataset(1795701, 99999999, 'russian_manifest_headers.csv')
    manMKdataset(1795701, 99999999, 'russian_manifest_headers.csv')