Remove metadata we're holding until v2

georgetown-cset · Apr 22, 2024 · 1d703f6
1 parent a7784df
commit 1d703f6
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 95 deletions.
diff --git a/README.md b/README.md
@@ -14,13 +14,6 @@ Our article linkage pipeline [generates language ID labels](https://github.com/g
 for titles and abstracts using [PYCLD2](https://pypi.org/project/pycld2/). We only include language IDs where PYCLD2
 successfully output a language and marked the output as reliable.
 
-### Fields of Study
-
-We include the top three level 1 field of study labels generated using the method described in
-[Multi-label Classification of Scientific Research Documents Across Domains and Languages](https://aclanthology.org/2022.sdp-1.12) (Toney & Dunham, sdp 2022).
-We only include these labels for records with a non-null English title and abstract (as detected by PYCLD2), where the
-abstract is over 500 words in length.
-
 ### Subject relevance predictions
 
 We share outputs for subject classifiers (for more information on how these classifiers were trained
@@ -31,7 +24,7 @@ in the following fields:
 * `is_nlp` - True if a natural language processing classifier predicted the work was relevant
 * `is_robotics` - True if a robotics classifier predicted the work was relevant
 * `is_ai` - True if an artificial intelligence classifier predicted the work was relevant, or if any of the computer vision, natural language processing, or robotics classifiers predicted the work was relevant
-* `is_ai_safety` - True if the artificial intelligence classifier predicted the work was relevant to AI, and a separate AI safety classifier also predicted the work was relevant to AI safety
+* `is_cyber` - True if a cybersecurity classifier predicted the work was relevant
 
 ## Updating the dataset
 

diff --git a/cset_openalex_augmentation_dag.py b/cset_openalex_augmentation_dag.py
@@ -51,7 +51,7 @@
     run_dir = "current_run"
     public_bucket = "mos-static"
     table = "metadata"
-    gce_resource_id = "cset-openalex-updates"
+    gce_resource_id = "cset-openalex-updater"
 
     clear_tmp_dir = GCSDeleteObjectsOperator(
         task_id="clear_tmp_dir", bucket_name=DATA_BUCKET, prefix=tmp_dir

diff --git a/schemas/metadata.json b/schemas/metadata.json
@@ -14,12 +14,6 @@
     "type": "STRING",
     "description": "Language of abstract, as detected by pycld2"
   },
-  {
-    "mode": "REPEATED",
-    "name": "top_level1_fields",
-    "type": "STRING",
-    "description": "Top three level 1 fields of study, using method in https://aclanthology.org/2022.sdp-1.12/"
-  },
   {
     "name": "is_ai",
     "type": "BOOLEAN",
@@ -39,10 +33,5 @@
     "name": "is_robotics",
     "type": "BOOLEAN",
     "description": "True if a CSET classifier predicted this work was relevant to Robotics, False if the classifier predicted that this work was not relevant to Robotics, and null if the classifier was not run on this work (i.e. if the work had a non-English or null title and abstract)."
-  },
-  {
-    "name": "is_ai_safety",
-    "type": "BOOLEAN",
-    "description": "True if a CSET classifier predicted this work was relevant to AI Safety, False if the classifier predicted that this work was not relevant to AI, and null if the classifier was not run on this work. This classifier was only run on works that were predicted relevant to AI by the AI classifier (see `is_ai`)"
   }
 ]
diff --git a/sql/metadata.sql b/sql/metadata.sql
@@ -1,83 +1,81 @@
 WITH
-field_name_scores AS (
-  SELECT
-    merged_id,
-    name,
-    field.score AS score
-  FROM
-    fields_of_study_v2.field_scores
-  CROSS JOIN
-    UNNEST(fields) AS field
-  LEFT JOIN
-    fields_of_study_v2.field_meta
-    ON
-      field_id = field.id
-  WHERE
-    (level = 1)),
-
-field_order AS (
-  SELECT
-    merged_id,
-    name,
-    score,
-    ROW_NUMBER() OVER(PARTITION BY merged_id ORDER BY score DESC) AS row_num
-  FROM
-    field_name_scores),
-
-top_fields AS (
-  SELECT
-    merged_id,
-    ARRAY_AGG(name ORDER BY score DESC) AS top_level1_fields
-  FROM
-    field_order
-  WHERE
-    (
-      row_num < 4
-    ) AND (
-      merged_id IN (
-        SELECT merged_id
-        FROM
-          literature.papers
-        WHERE
-          (
-            title_english IS NOT NULL
-          ) AND (abstract_english IS NOT NULL) AND (LENGTH(abstract_english) > 500) AND (year > 2010)
-      )
-    )
-  GROUP BY merged_id
-),
+-- field_name_scores AS (
+--   SELECT
+--     merged_id,
+--     name,
+--     field.score AS score
+--   FROM
+--     fields_of_study_v2.field_scores
+--   CROSS JOIN
+--     UNNEST(fields) AS field
+--   LEFT JOIN
+--     fields_of_study_v2.field_meta
+--     ON
+--       field_id = field.id
+--   WHERE
+--     (level = 1)),
+--
+-- field_order AS (
+--   SELECT
+--     merged_id,
+--     name,
+--     score,
+--     ROW_NUMBER() OVER(PARTITION BY merged_id ORDER BY score DESC) AS row_num
+--   FROM
+--     field_name_scores),
+--
+-- top_fields AS (
+--   SELECT
+--     merged_id,
+--     ARRAY_AGG(name ORDER BY score DESC) AS top_level1_fields
+--   FROM
+--     field_order
+--   WHERE
+--     (
+--       row_num < 4
+--     ) AND (
+--       merged_id IN (
+--         SELECT merged_id
+--         FROM
+--           literature.papers
+--         WHERE
+--           (
+--             title_english IS NOT NULL
+--           ) AND (abstract_english IS NOT NULL) AND (LENGTH(abstract_english) > 500) AND (year > 2010)
+--       )
+--     )
+--   GROUP BY merged_id
+-- ),
 
 ai_pubs AS (
   SELECT
     orig_id,
     ai OR nlp OR cv OR robotics AS is_ai,
     nlp AS is_nlp,
     cv AS is_cv,
-    robotics AS is_robotics
+    robotics AS is_robotics,
+    cyber AS is_cyber
   FROM
     openalex_article_classification.predictions
-  WHERE
-    ai IS TRUE
-    OR nlp IS TRUE
-    OR cv IS TRUE
-    OR robotics IS TRUE
 ),
 
-ai_safety_pubs AS (
-  SELECT
-    orig_id,
-    preds_str AS is_ai_safety
-  FROM
-    ai_safety_openalex.ai_safety_predictions
-),
+-- ai_safety_pubs AS (
+--   SELECT
+--     orig_id,
+--     preds_str AS is_ai_safety
+--   FROM
+--     ai_safety_openalex.ai_safety_predictions
+-- ),
 
 language_id AS (
   SELECT DISTINCT
     id,
-    IF(title_cld2_lid_success AND title_cld2_lid_is_reliable, title_cld2_lid_first_result, NULL) AS title_language,
-    IF(
+    LOWER(
+      IF(title_cld2_lid_success AND title_cld2_lid_is_reliable, title_cld2_lid_first_result, NULL)
+    ) AS title_language,
+    LOWER(IF(
       abstract_cld2_lid_success AND abstract_cld2_lid_is_reliable, abstract_cld2_lid_first_result, NULL
-    ) AS abstract_language
+    )) AS abstract_language
   FROM
     staging_literature.all_metadata_with_cld2_lid
 )
@@ -86,26 +84,24 @@ SELECT
   id,
   title_language,
   abstract_language,
-  top_level1_fields,
+  --  top_level1_fields,
   is_ai,
   is_nlp,
   is_cv,
   is_robotics,
-  is_ai_safety
+  is_cyber
+--  is_ai_safety
 FROM
   openalex.works
-INNER JOIN
-  literature.sources
-  ON id = orig_id
-LEFT JOIN
-  top_fields
-  USING (merged_id)
+-- LEFT JOIN
+--   top_fields
+--   USING (merged_id)
 LEFT JOIN
   ai_pubs
-  ON id = orig_id
-LEFT JOIN
-  ai_safety_pubs
-  ON id = orig_id
+  ON id = ai_pubs.orig_id
+-- LEFT JOIN
+--   ai_safety_pubs
+--   ON id = orig_id
 LEFT JOIN
   language_id
   USING (id)