
ruffen up #131

Merged · 7 commits · Oct 14, 2024
3 changes: 3 additions & 0 deletions .github/workflows/pytest.yml
@@ -28,3 +28,6 @@ jobs:
       - name: Run tests
         run: |
           poetry run pytest
+      - name: Run ruff
+        run: |
+          poetry run ruff check
73 changes: 0 additions & 73 deletions analysis/analyze_movement.py

This file was deleted.

19 changes: 4 additions & 15 deletions analysis/extreme_dates.py
@@ -7,27 +7,16 @@
 See https://github.com/danvk/oldnyc/issues/3
 '''

-import os, sys
-parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.insert(0,parentdir)
-
 import re

-import record
+from data.item import load_items

 def extract_dates(date_str):
     return re.findall(r'\b(1[6789]\d\d)\b', date_str)


-def mkurl(r):
-    return 'http://digitalcollections.nypl.org/items/image_id/%s' % (
-        re.sub(r'-[a-z]$', '', r.photo_id()))
-
-
 if __name__ == '__main__':
-    rs = record.AllRecords('nyc/records.pickle')
+    rs = load_items("data/images.ndjson")
     for r in rs:
-        dstr = re.sub(r'\s+', ' ', r.date())
-        if not dstr: continue
-        for d in extract_dates(dstr):
+        for d in extract_dates(r.date or ""):
             if d < '1860' or d > '1945':
-                print '%4s\t%s\t%s' % (d, r.photo_id(), mkurl(r))
+                print("%4s\t%s\t%s" % (d, r.id, r.url))
4 changes: 2 additions & 2 deletions analysis/ocr_counts.py
@@ -16,5 +16,5 @@ def get_back_id(photo_id):
     if record.get('text'):
         back_ids_with_text.add(back_id)

-print 'Total records: %d' % len(back_ids)
-print '  with text: %d' % len(back_ids_with_text)
+print("Total records: %d" % len(back_ids))
+print("  with text: %d" % len(back_ids_with_text))
43 changes: 0 additions & 43 deletions analysis/ocr_training_set.py

This file was deleted.

4 changes: 2 additions & 2 deletions analysis/rotations/extract_rotations.py
@@ -48,8 +48,8 @@ def histogram(lst):
 all_feedback = json.load(open('../../feedback/user-feedback.json'))['feedback']

 for photo_id, feedback in all_feedback.items():
-    if 'rotate' not in feedback: continue
-    if photo_id in BLACKLIST: continue
+    if ("rotate" not in feedback) or photo_id in BLACKLIST:
+        continue
     rotations = feedback['rotate']

     for rotation in rotations.values():
107 changes: 56 additions & 51 deletions cluster-locations.py
@@ -14,7 +14,6 @@
 - look at before/after
 """

-from collections import defaultdict
 import fileinput

 DISTANCE_THRESHOLD = 20
@@ -25,15 +24,16 @@
 lat_lons = []
 orig_points = 0
 for line in fileinput.input():
-  line = line.strip()
-  if not line: continue
-  orig_points += 1
-  count, ll = line.split('\t')
-  lat, lon = [float(x) for x in ll.split(',')]
-  count = int(count)
+    line = line.strip()
+    if not line:
+        continue
+    orig_points += 1
+    count, ll = line.split("\t")
+    lat, lon = [float(x) for x in ll.split(",")]
+    count = int(count)

-  counts.append(count)
-  lat_lons.append((lat, lon))
+    counts.append(count)
+    lat_lons.append((lat, lon))


 def UrlForIndex(idx):
@@ -65,44 +65,48 @@ def centroidForIndices(idxs):
 # calculate all-pairs distances
 nns = []  # index -> list of (distance, index) neighbors
 for i in range(0, len(lat_lons)):
-  neighbors = []  # (dist, index)
-  a = lat_lons[i]
-  for j in range(i + 1, len(lat_lons)):
-    b = lat_lons[j]
-    d = dist(a, b)
-    if d > DISTANCE_THRESHOLD: continue
-    neighbors.append((-d, j))
-  neighbors.sort()
+    neighbors = []  # (dist, index)
+    a = lat_lons[i]
+    for j in range(i + 1, len(lat_lons)):
+        b = lat_lons[j]
+        d = dist(a, b)
+        if d > DISTANCE_THRESHOLD:
+            continue
+        neighbors.append((-d, j))
+    neighbors.sort()

-  nns.append([(-x[0], x[1]) for x in neighbors])
+    nns.append([(-x[0], x[1]) for x in neighbors])


 # we hope there aren't any really degenerate cases
 cluster_map = {}  # idx -> cluster representative idx
 for i, buds in enumerate(nns):
-  if not buds: continue
-
-  cluster_idx = i
-  if i in cluster_map: cluster_idx = cluster_map[i]
-  for d, j in buds:
-    if j in cluster_map:
-      if cluster_map[j] != cluster_idx:
-        old_idx = cluster_map[j]
-        for idx, rep in cluster_map.items():
-          if rep == old_idx: cluster_map[idx] = cluster_idx
-        cluster_map[old_idx] = cluster_idx
-        # this is pathological behavior; we artificially break the cluster
-        #a = j
-        #b = cluster_map[j]
-        #c = cluster_idx
-        #ll = lat_lons[a]
-        #print '  Current: %d = 0.000 %s %s' % (a, ll, UrlForIndex(b))
-        #print 'Old cluster: %d = %.3f %s %s' % (
-        #  b, dist(ll, lat_lons[b]), lat_lons[b], UrlForIndex(b))
-        #print 'New cluster: %d = %.3f %s %s' % (
-        #  c, dist(ll, lat_lons[c]), lat_lons[c], UrlForIndex(c))
-        #assert False
-    cluster_map[j] = cluster_idx
+    if not buds:
+        continue
+
+    cluster_idx = i
+    if i in cluster_map:
+        cluster_idx = cluster_map[i]
+    for d, j in buds:
+        if j in cluster_map:
+            if cluster_map[j] != cluster_idx:
+                old_idx = cluster_map[j]
+                for idx, rep in cluster_map.items():
+                    if rep == old_idx:
+                        cluster_map[idx] = cluster_idx
+                cluster_map[old_idx] = cluster_idx
+                # this is pathological behavior; we artificially break the cluster
+                # a = j
+                # b = cluster_map[j]
+                # c = cluster_idx
+                # ll = lat_lons[a]
+                # print '  Current: %d = 0.000 %s %s' % (a, ll, UrlForIndex(b))
+                # print 'Old cluster: %d = %.3f %s %s' % (
+                #     b, dist(ll, lat_lons[b]), lat_lons[b], UrlForIndex(b))
+                # print 'New cluster: %d = %.3f %s %s' % (
+                #     c, dist(ll, lat_lons[c]), lat_lons[c], UrlForIndex(c))
+                # assert False
+        cluster_map[j] = cluster_idx


 clusters = {}  # representative idx -> list of constituent indices
@@ -121,13 +125,14 @@ def centroidForIndices(idxs):


 if output_mode == 'urls':
-  num_points = 0
-  for base, members in clusters.items():
-    if not members: continue
-    print('(%d)' % len(members))
-    for i in members:
-      print('  %s' % UrlForIndex(i))
-    print()
-    num_points += len(members)
-
-  print('%d clusters, %d/%d points' % (len(clusters), num_points, orig_points))
+    num_points = 0
+    for base, members in clusters.items():
+        if not members:
+            continue
+        print("(%d)" % len(members))
+        for i in members:
+            print("  %s" % UrlForIndex(i))
+        print()
+        num_points += len(members)
+
+    print('%d clusters, %d/%d points' % (len(clusters), num_points, orig_points))
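
An observation on the merge step in this file: when two clusters collide, the code rewrites every matching entry in cluster_map, which is linear per merge and quadratic overall in the worst case. For comparison (not part of this PR), a disjoint-set (union-find) sketch that performs the same merging in near-constant amortized time:

class DisjointSet:
    # Union-find with path halving.
    def __init__(self, n):
        self.parent = list(range(n))

    def find(self, i):
        # Walk up to the root, shortening the path as we go.
        while self.parent[i] != i:
            self.parent[i] = self.parent[self.parent[i]]
            i = self.parent[i]
        return i

    def union(self, i, j):
        self.parent[self.find(i)] = self.find(j)

Run against the same neighbor lists, ds = DisjointSet(len(lat_lons)), then ds.union(i, j) for each (d, j) in nns[i]; the cluster representative of point i is ds.find(i).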
5 changes: 2 additions & 3 deletions coders/extended_grid.py
@@ -17,7 +17,6 @@

 import coders.registration
 from data.item import Item, blank_item
-import record
 from grid import coder


@@ -61,13 +60,13 @@ def parse_street_ave(street1: str, street2: str) -> tuple[str, str]:

     # pull the number from the street string
     num = extract_ordinal(street2)
-    if num == None:
+    if num is None:
         raise ValueError('Unable to find a number in %s' % street2)
     street2 = num

     # Try the same for the avenue
     num = extract_ordinal(street1)
-    if num != None:
+    if num is not None:
         street1 = str(num)
     else:
         # Look for something like 'Avenue A'
2 changes: 1 addition & 1 deletion coders/nyc_parks.py
@@ -264,7 +264,7 @@ def codeRecord(self, r: Item):
             bridge = m.group(1)
             # if not ('Bridge' in bridge or 'bridge' in bridge):
             # XXX this is weird
-            if not "Bridge" in bridge or "bridge" in bridge:
+            if not "Bridge" in bridge or "bridge" in bridge:  # noqa: E713
                 bridge += " Bridge"
             if bridge not in bridges:
                 missing_bridges[bridge] += 1
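
The # noqa: E713 silences ruff, but the retained condition is exactly the oddity the XXX comment flags: in binds tighter than not, so the test parses as (not ("Bridge" in bridge)) or ("bridge" in bridge) rather than the commented-out grouping above it. A small demonstration with a hypothetical input:

bridge = "Brooklyn bridge"
# What the kept condition evaluates:
print((not ("Bridge" in bridge)) or ("bridge" in bridge))  # True -> appends " Bridge" again
# What the commented-out grouping would evaluate:
print(not ("Bridge" in bridge or "bridge" in bridge))      # False -> leaves the name alone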
2 changes: 1 addition & 1 deletion generate_js.py
@@ -139,7 +139,7 @@ def printRecordsJson(located_recs: list[LocatedRecord]):

         # TODO: remove this
         try:
-            x = json.dumps(rec)
+            json.dumps(rec)
         except Exception as e:
             sys.stderr.write('%s\n' % rec)
             raise e