Commit a743d50

Merge pull request #36 from georgetown-cset/35-cloud-composer-2-and-sensors

Move to cloud composer 2; better handle long-running tasks
jmelot authored Mar 1, 2024
2 parents b159349 + 05a076c commit a743d50
Showing 77 changed files with 1,481 additions and 784 deletions.
5 changes: 5 additions & 0 deletions .flake8
@@ -0,0 +1,5 @@
[flake8]
ignore = E203, E266, E501, W503, F403, F401
max-line-length = 120
max-complexity = 20
select = B,C,E,F,W,T4,B9
75 changes: 75 additions & 0 deletions .github/workflows/main.yml
@@ -0,0 +1,75 @@
name: Python application

on: [pull_request]

jobs:
  build:
    name: tests-pass
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.7
        uses: actions/setup-python@v1
        with:
          python-version: 3.7
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Test with pytest
        run: |
          coverage run -m pytest tests
          coverage xml -o coverage/python.xml
      - name: Report python coverage
        uses: orgoro/coverage@v3
        with:
          coverageFile: coverage/python.xml
          token: ${{ secrets.GITHUB_TOKEN }}
      # The next few steps only apply if you have javascript files
      # - name: Setup node
      #   uses: actions/setup-node@v3
      #   with:
      #     node-version: '18'
      # - name: Test with jest
      #   shell: bash
      #   run: |
      #     npm install
      #     npm test -- --coverage --coverageReporters="json-summary" --coverageReporters="text" | tee ./coverage.txt
      # - name: Report javascript coverage
      #   uses: MishaKav/jest-coverage-comment@v1.0.20
      #   with:
      #     title: "JavaScript Coverage"
      #     summary-title: "Summary"
      #     coverage-title: "Modified Files"
      #     github-token: ${{ secrets.GITHUB_TOKEN }}
      #     report-only-changed-files: true
      #     coverage-path: ./JS-FOLDER-NAME/coverage.txt
      #     coverage-summary-path: ./JS-FOLDER-NAME/coverage/coverage-summary.json
      #     coverage-path-prefix: JS-FOLDER-NAME/src/
      # - name: Build output files
      #   run: |
      #     npm run build
      # - name: Check links in built files
      #   id: link_check
      #   run: |
      #     find public -name "*.js" -exec grep -Eo "(http|https):\/\/[^]\{\}\"'\\\(\)\> ]+" {} \; | sort -u > linklist.txt
      #     printf '%s\n%s\n%s\n' "# LinkChecker URL list" "# <meta charset=\"UTF-8\">" "$(cat linklist.txt)" > linklist.txt
      #     linkchecker linklist.txt --check-extern --ignore-url="https://.*\.fastly\.net/.*" --ignore-url="https://.*\.mapbox\..*" --ignore-url=".*//a\W.*" --ignore-url="http://(a|x|тест)" -o failures > output.txt || true
      #     cat output.txt
      #     echo "num_links=$(wc -l < output.txt | sed 's/^ *//g')" >> $GITHUB_OUTPUT
      #     echo "links<<EOFdelimiter" >> $GITHUB_OUTPUT
      #     echo "$(cat output.txt)" >> $GITHUB_OUTPUT
      #     echo "EOFdelimiter" >> $GITHUB_OUTPUT
      # - name: Edit PR comment about link checking
      #   if: steps.link_check.outputs.num_links > 0
      #   uses: thollander/actions-comment-pull-request@v2
      #   with:
      #     message: |
      #       There are ${{ steps.link_check.outputs.num_links }} broken links. Check the code for these links:
      #       ${{ steps.link_check.outputs.links }}
      #     comment_tag: link_check_msg
      - name: Run linting
        run: |
          pre-commit run --all-files
42 changes: 42 additions & 0 deletions .github/workflows/rebase-reminder.yml
@@ -0,0 +1,42 @@
name: Rebase reminder
on: [pull_request, pull_request_review]

jobs:
  build:
    name: rebuild-reminder
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Find behind count
        id: behind_count
        run: |
          echo "behind_count=$(git rev-list --count ${{ github.event.pull_request.head.sha }}..${{ github.event.pull_request.base.sha }})" >> $GITHUB_OUTPUT
      - name: Find ahead count
        id: ahead_count
        run: |
          echo "ahead_count=$(git rev-list --count ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }})" >> $GITHUB_OUTPUT
      - name: Find combined count
        id: combined_count
        run: |
          echo "combined_count=$(expr ${{ steps.behind_count.outputs.behind_count }} + ${{ steps.ahead_count.outputs.ahead_count }})" >> $GITHUB_OUTPUT
      - name: Edit PR comment - rebasing
        if: steps.behind_count.outputs.behind_count > 0 && steps.combined_count.outputs.combined_count > 3
        uses: thollander/actions-comment-pull-request@v1
        with:
          message: |
            Needs rebasing :bangbang:
            behind_count is ${{ steps.behind_count.outputs.behind_count }}
            ahead_count is ${{ steps.ahead_count.outputs.ahead_count }}
          comment_includes: 'rebasing'
      - name: Edit PR comment - no rebasing
        if: steps.behind_count.outputs.behind_count == 0 || steps.combined_count.outputs.combined_count <= 3
        uses: thollander/actions-comment-pull-request@v1
        with:
          message: |
            No need for rebasing :+1:
            behind_count is ${{ steps.behind_count.outputs.behind_count }}
            ahead_count is ${{ steps.ahead_count.outputs.ahead_count }}
          comment_includes: 'rebasing'
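
The behind/ahead arithmetic above is plain `git rev-list --count` over the two directions of the base..head range. A minimal Python sketch for reproducing the counts locally (assuming a checkout with the base branch fetched; the `origin/master` ref name is an assumption, not from this repo):

```python
import subprocess

def rev_count(range_spec: str) -> int:
    """Count commits in a git revision range, e.g. 'origin/master..HEAD'."""
    out = subprocess.run(
        ["git", "rev-list", "--count", range_spec],
        capture_output=True, text=True, check=True,
    )
    return int(out.stdout.strip())

# base and head stand in for the PR's base/head SHAs used in the workflow above.
base, head = "origin/master", "HEAD"
behind = rev_count(f"{head}..{base}")  # commits on base that head lacks
ahead = rev_count(f"{base}..{head}")   # commits on head not yet on base
print(f"behind_count={behind} ahead_count={ahead} combined={behind + ahead}")
```
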
37 changes: 37 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,37 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    hooks:
      - id: trailing-whitespace
        exclude: "__snapshots__"
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
      - id: check-json
  # The next step only applies if you have javascript files.
  # There should be a package.json that installs eslint
  # (or eslint-config-react-app if you are using gatsby).
  # - repo: https://github.com/pre-commit/mirrors-eslint
  #   rev: v8.24.0
  #   hooks:
  #     - id: eslint
  - repo: https://github.com/PyCQA/isort
    rev: 5.11.5
    hooks:
      - id: isort
  - repo: https://github.com/ambv/black
    rev: 22.3.0
    hooks:
      - id: black
        language_version: python3
  - repo: https://github.com/PyCQA/flake8
    rev: 4.0.1
    hooks:
      - id: flake8
  - repo: https://github.com/sqlfluff/sqlfluff
    rev: 0.10.1
    hooks:
      - id: sqlfluff-lint
      - id: sqlfluff-fix
19 changes: 19 additions & 0 deletions .sqlfluff
@@ -0,0 +1,19 @@
[sqlfluff]
dialect=bigquery
indent_unit = space
exclude_rules = L014,L018,L027,L032,L034,L042,L044,L031

[sqlfluff:rules]
max_line_length = 120
tab_space_size = 2

[sqlfluff:rules:L010]
capitalisation_policy = upper

[sqlfluff:rules:L030]
extended_capitalisation_policy = upper

[sqlfluff:templater:jinja:context]
staging_dataset = staging_literature
production_dataset = literature
params = {"strong": "title", "other": "year", "additional_checks": "", "tables": "staging_literature.table"}
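
The `[sqlfluff:templater:jinja:context]` block supplies values that sqlfluff's jinja templater substitutes into the SQL before linting. A rough Python illustration of that substitution (the query string is a made-up example, not a file from this repo):

```python
from jinja2 import Template

# Context values mirroring the .sqlfluff config above.
context = {
    "staging_dataset": "staging_literature",
    "production_dataset": "literature",
    "params": {"strong": "title", "other": "year"},
}

# Hypothetical templated query of the kind sqlfluff would lint.
sql = "SELECT {{ params['strong'] }} FROM {{ staging_dataset }}.table"
print(Template(sql).render(**context))
# -> SELECT title FROM staging_literature.table
```
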
2 changes: 2 additions & 0 deletions .sqlfluffignore
@@ -0,0 +1,2 @@
evaluation/
tests/
20 changes: 10 additions & 10 deletions README.md
@@ -1,15 +1,15 @@
# Article Linking
![Python application](https://github.com/georgetown-cset/article-linking/workflows/Python%20application/badge.svg)

This repository contains a description and supporting code for CSET's current method of
cross-dataset article linking. Note that we use "article" very loosely, although in a way that to our knowledge
is fairly consistent across corpora. Books, for example, are included.

For each article in arXiv, WOS, Papers With Code, Semantic Scholar, The Lens, and OpenAlex
we normalized titles, abstracts, and author last names. For the purpose of matching, we filtered out
titles, abstracts, and DOIs that occurred more than 10 times in the corpus. We then considered each group of articles
within or across datasets that shared at least one of the following (non-null) metadata fields:

* Normalized title
* Normalized abstract
* Citations
@@ -19,8 +19,8 @@
as well as a match on one additional field above, or on

* Publication year
* Normalized author last names

to correspond to one article in the merged dataset. We add to this set "near matches" of the concatenation
of the normalized title and abstract within a publication year, which we identify using simhash.
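
The near-match step can be sketched with the `simhash` package; this is an illustrative reconstruction under assumed record fields, not the DAG's actual code:

```python
from simhash import Simhash, SimhashIndex

# Toy records: (id, publication year, normalized title + abstract).
records = [
    ("wos:1", 2020, "deep learning for chemistry survey of methods"),
    ("arxiv:1", 2020, "deep learning for chemistry a survey of methods"),
    ("openalex:1", 2021, "unrelated paper about economics"),
]

# Near-duplicates are only compared within a publication year.
by_year = {}
for rec_id, year, text in records:
    by_year.setdefault(year, []).append((rec_id, Simhash(text)))

for year, hashes in by_year.items():
    index = SimhashIndex(hashes, k=3)  # k = max bit distance counted as "near"
    for rec_id, h in hashes:
        dups = [d for d in index.get_near_dups(h) if d != rec_id]
        if dups:
            print(year, rec_id, "near-matches", dups)
```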

To do this, we run `linkage_dag.py` on Airflow. The article linkage runs weekly, triggered by the `scholarly_lit_trigger` dag.
@@ -31,15 +31,15 @@
For an English description of what the dag does, see [the documentation](methods

We have three tables that are most likely to help you use article linkage.

- `gcp_cset_links_v2.article_links` - For each original ID (e.g., from WoS), gives the corresponding CSET ID.
This is a many-to-one mapping. Please update your scripts to use `gcp_cset_links_v2.article_links_with_dataset`,
which has an additional column that contains the dataset of the `orig_id`.

- `gcp_cset_links_v2.all_metadata_with_cld2_lid` - provides CLD2 LID for the titles and abstracts of each
current version of each article's metadata. You can also use this table to get the metadata used in the
match for each version of the raw articles. Note that the `id` column is _not_ unique as some corpora like WOS
have multiple versions of the metadata for different languages.

- `gcp_cset_links_v2.article_merged_metadata` - This maps the CSET `merged_id` to a set of merged metadata.
The merging method takes the maximum value of each metadata field across each matched article, which may not
be suitable for your purposes.
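
As a usage illustration (not code from this repo), resolving an original corpus ID to its merged metadata might look like the following, using the `google-cloud-bigquery` client; the WoS ID and the `title`/`year` column names are assumptions:

```python
from google.cloud import bigquery

client = bigquery.Client()  # assumes application-default credentials

# Hypothetical example: resolve one WoS record to its CSET merged metadata.
query = """
SELECT links.merged_id, meta.title, meta.year
FROM `gcp_cset_links_v2.article_links_with_dataset` AS links
JOIN `gcp_cset_links_v2.article_merged_metadata` AS meta
  ON links.merged_id = meta.merged_id
WHERE links.orig_id = @orig_id
"""
job = client.query(
    query,
    job_config=bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("orig_id", "STRING", "WOS:000123456700001")
        ]
    ),
)
for row in job:
    print(row.merged_id, row.title, row.year)
```
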
6 changes: 3 additions & 3 deletions evaluation/positive_match_no_simhash_for_annotators.sql
@@ -1,11 +1,11 @@
select
merged_id,
data1.orig_id as orig_id1,
data2.orig_id as orig_id2,
data1.metadata as metadata1,
data2.metadata as metadata2
from article_links_v3_eval.positive_match_no_simhash as data1
inner join
article_links_v3_eval.positive_match_no_simhash as data2
using(merged_id)
where data1.orig_id > data2.orig_id -- avoid having annotators annotate twice