
ruffen up #131

Merged · 7 commits · Oct 14, 2024
3 changes: 3 additions & 0 deletions .github/workflows/pytest.yml
@@ -28,3 +28,6 @@ jobs:
       - name: Run tests
         run: |
           poetry run pytest
+      - name: Run ruff
+        run: |
+          poetry run ruff check
73 changes: 0 additions & 73 deletions analysis/analyze_movement.py

This file was deleted.

19 changes: 4 additions & 15 deletions analysis/extreme_dates.py
@@ -7,27 +7,16 @@
 See https://github.com/danvk/oldnyc/issues/3
 '''

-import os, sys
-parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.insert(0,parentdir)
-
 import re

-import record
+from data.item import load_items

 def extract_dates(date_str):
     return re.findall(r'\b(1[6789]\d\d)\b', date_str)


-def mkurl(r):
-    return 'http://digitalcollections.nypl.org/items/image_id/%s' % (
-        re.sub(r'-[a-z]$', '', r.photo_id()))
-
-
 if __name__ == '__main__':
-    rs = record.AllRecords('nyc/records.pickle')
+    rs = load_items("data/images.ndjson")
     for r in rs:
-        dstr = re.sub(r'\s+', ' ', r.date())
-        if not dstr: continue
-        for d in extract_dates(dstr):
+        for d in extract_dates(r.date or ""):
             if d < '1860' or d > '1945':
-                print '%4s\t%s\t%s' % (d, r.photo_id(), mkurl(r))
+                print("%4s\t%s\t%s" % (d, r.id, r.url))
4 changes: 2 additions & 2 deletions analysis/ocr_counts.py
@@ -16,5 +16,5 @@ def get_back_id(photo_id):
     if record.get('text'):
         back_ids_with_text.add(back_id)

-print 'Total records: %d' % len(back_ids)
-print '  with text: %d' % len(back_ids_with_text)
+print("Total records: %d" % len(back_ids))
+print("  with text: %d" % len(back_ids_with_text))
43 changes: 0 additions & 43 deletions analysis/ocr_training_set.py

This file was deleted.

4 changes: 2 additions & 2 deletions analysis/rotations/extract_rotations.py
@@ -48,8 +48,8 @@ def histogram(lst):
 all_feedback = json.load(open('../../feedback/user-feedback.json'))['feedback']

 for photo_id, feedback in all_feedback.items():
-    if 'rotate' not in feedback: continue
-    if photo_id in BLACKLIST: continue
+    if ("rotate" not in feedback) or photo_id in BLACKLIST:
+        continue
     rotations = feedback['rotate']

     for rotation in rotations.values():
107 changes: 56 additions & 51 deletions cluster-locations.py
@@ -14,7 +14,6 @@
 - look at before/after
 """

-from collections import defaultdict
 import fileinput

 DISTANCE_THRESHOLD = 20
@@ -25,15 +24,16 @@
 lat_lons = []
 orig_points = 0
 for line in fileinput.input():
-  line = line.strip()
-  if not line: continue
-  orig_points += 1
-  count, ll = line.split('\t')
-  lat, lon = [float(x) for x in ll.split(',')]
-  count = int(count)
+    line = line.strip()
+    if not line:
+        continue
+    orig_points += 1
+    count, ll = line.split("\t")
+    lat, lon = [float(x) for x in ll.split(",")]
+    count = int(count)

-  counts.append(count)
-  lat_lons.append((lat, lon))
+    counts.append(count)
+    lat_lons.append((lat, lon))


 def UrlForIndex(idx):
@@ -65,44 +65,48 @@ def centroidForIndices(idxs):
 # calculate all-pairs distances
 nns = []  # index -> list of (distance, index) neighbors
 for i in range(0, len(lat_lons)):
-  neighbors = []  # (dist, index)
-  a = lat_lons[i]
-  for j in range(i + 1, len(lat_lons)):
-    b = lat_lons[j]
-    d = dist(a, b)
-    if d > DISTANCE_THRESHOLD: continue
-    neighbors.append((-d, j))
-  neighbors.sort()
+    neighbors = []  # (dist, index)
+    a = lat_lons[i]
+    for j in range(i + 1, len(lat_lons)):
+        b = lat_lons[j]
+        d = dist(a, b)
+        if d > DISTANCE_THRESHOLD:
+            continue
+        neighbors.append((-d, j))
+    neighbors.sort()

-  nns.append([(-x[0], x[1]) for x in neighbors])
+    nns.append([(-x[0], x[1]) for x in neighbors])


 # we hope there aren't any really degenerate cases
 cluster_map = {}  # idx -> cluster representative idx
 for i, buds in enumerate(nns):
-  if not buds: continue
-
-  cluster_idx = i
-  if i in cluster_map: cluster_idx = cluster_map[i]
-  for d, j in buds:
-    if j in cluster_map:
-      if cluster_map[j] != cluster_idx:
-        old_idx = cluster_map[j]
-        for idx, rep in cluster_map.items():
-          if rep == old_idx: cluster_map[idx] = cluster_idx
-        cluster_map[old_idx] = cluster_idx
-        # this is pathological behavior; we artificially break the cluster
-        #a = j
-        #b = cluster_map[j]
-        #c = cluster_idx
-        #ll = lat_lons[a]
-        #print '  Current: %d = 0.000 %s %s' % (a, ll, UrlForIndex(b))
-        #print 'Old cluster: %d = %.3f %s %s' % (
-        #  b, dist(ll, lat_lons[b]), lat_lons[b], UrlForIndex(b))
-        #print 'New cluster: %d = %.3f %s %s' % (
-        #  c, dist(ll, lat_lons[c]), lat_lons[c], UrlForIndex(c))
-        #assert False
-    cluster_map[j] = cluster_idx
+    if not buds:
+        continue
+
+    cluster_idx = i
+    if i in cluster_map:
+        cluster_idx = cluster_map[i]
+    for d, j in buds:
+        if j in cluster_map:
+            if cluster_map[j] != cluster_idx:
+                old_idx = cluster_map[j]
+                for idx, rep in cluster_map.items():
+                    if rep == old_idx:
+                        cluster_map[idx] = cluster_idx
+                cluster_map[old_idx] = cluster_idx
+                # this is pathological behavior; we artificially break the cluster
+                # a = j
+                # b = cluster_map[j]
+                # c = cluster_idx
+                # ll = lat_lons[a]
+                # print '  Current: %d = 0.000 %s %s' % (a, ll, UrlForIndex(b))
+                # print 'Old cluster: %d = %.3f %s %s' % (
+                #     b, dist(ll, lat_lons[b]), lat_lons[b], UrlForIndex(b))
+                # print 'New cluster: %d = %.3f %s %s' % (
+                #     c, dist(ll, lat_lons[c]), lat_lons[c], UrlForIndex(c))
+                # assert False
+        cluster_map[j] = cluster_idx


 clusters = {}  # representative idx -> list of constituent indices
@@ -121,13 +125,14 @@ def centroidForIndices(idxs):


 if output_mode == 'urls':
-  num_points = 0
-  for base, members in clusters.items():
-    if not members: continue
-    print('(%d)' % len(members))
-    for i in members:
-      print('  %s' % UrlForIndex(i))
-    print()
-    num_points += len(members)
-
-  print('%d clusters, %d/%d points' % (len(clusters), num_points, orig_points))
+    num_points = 0
+    for base, members in clusters.items():
+        if not members:
+            continue
+        print("(%d)" % len(members))
+        for i in members:
+            print("  %s" % UrlForIndex(i))
+        print()
+        num_points += len(members)
+
+    print('%d clusters, %d/%d points' % (len(clusters), num_points, orig_points))
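
An observation on the merge step in this file: when two clusters collide, the code rewrites every matching entry in cluster_map, which is linear per merge and quadratic overall in the worst case. For comparison (not part of this PR), a disjoint-set (union-find) sketch that performs the same merging in near-constant amortized time:

class DisjointSet:
    # Union-find with path halving.
    def __init__(self, n):
        self.parent = list(range(n))

    def find(self, i):
        # Walk up to the root, shortening the path as we go.
        while self.parent[i] != i:
            self.parent[i] = self.parent[self.parent[i]]
            i = self.parent[i]
        return i

    def union(self, i, j):
        self.parent[self.find(i)] = self.find(j)

Run against the same neighbor lists, ds = DisjointSet(len(lat_lons)), then ds.union(i, j) for each (d, j) in nns[i]; the cluster representative of point i is ds.find(i).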
5 changes: 2 additions & 3 deletions coders/extended_grid.py
@@ -17,7 +17,6 @@

 import coders.registration
 from data.item import Item, blank_item
-import record
 from grid import coder


@@ -61,13 +60,13 @@ def parse_street_ave(street1: str, street2: str) -> tuple[str, str]:

     # pull the number from the street string
     num = extract_ordinal(street2)
-    if num == None:
+    if num is None:
         raise ValueError('Unable to find a number in %s' % street2)
     street2 = num

     # Try the same for the avenue
     num = extract_ordinal(street1)
-    if num != None:
+    if num is not None:
         street1 = str(num)
     else:
         # Look for something like 'Avenue A'
2 changes: 1 addition & 1 deletion coders/nyc_parks.py
@@ -264,7 +264,7 @@ def codeRecord(self, r: Item):
             bridge = m.group(1)
             # if not ('Bridge' in bridge or 'bridge' in bridge):
             # XXX this is weird
-            if not "Bridge" in bridge or "bridge" in bridge:
+            if not "Bridge" in bridge or "bridge" in bridge:  # noqa: E713
                 bridge += " Bridge"
             if bridge not in bridges:
                 missing_bridges[bridge] += 1
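
The # noqa: E713 silences ruff, but the retained condition is exactly the oddity the XXX comment flags: in binds tighter than not, so the test parses as (not ("Bridge" in bridge)) or ("bridge" in bridge) rather than the commented-out grouping above it. A small demonstration with a hypothetical input:

bridge = "Brooklyn bridge"
# What the kept condition evaluates:
print((not ("Bridge" in bridge)) or ("bridge" in bridge))  # True -> appends " Bridge" again
# What the commented-out grouping would evaluate:
print(not ("Bridge" in bridge or "bridge" in bridge))      # False -> leaves the name alone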
2 changes: 1 addition & 1 deletion generate_js.py
@@ -139,7 +139,7 @@ def printRecordsJson(located_recs: list[LocatedRecord]):

         # TODO: remove this
         try:
-            x = json.dumps(rec)
+            json.dumps(rec)
         except Exception as e:
             sys.stderr.write('%s\n' % rec)
             raise e