diff --git a/README.md b/README.md index 1d9fb5f..836d35a 100644 --- a/README.md +++ b/README.md @@ -14,13 +14,6 @@ Our article linkage pipeline [generates language ID labels](https://github.com/g for titles and abstracts using [PYCLD2](https://pypi.org/project/pycld2/). We only include language IDs where PYCLD2 successfully output a language and marked the output as reliable. -### Fields of Study - -We include the top three level 1 field of study labels generated using the method described in -[Multi-label Classification of Scientific Research Documents Across Domains and Languages](https://aclanthology.org/2022.sdp-1.12) (Toney & Dunham, sdp 2022). -We only include these labels for records with a non-null English title and abstract (as detected by PYCLD2), where the -abstract is over 500 words in length. - ### Subject relevance predictions We share outputs for subject classifiers (for more information on how these classifiers were trained @@ -31,7 +24,7 @@ in the following fields: * `is_nlp` - True if a natural language processing classifier predicted the work was relevant * `is_robotics` - True if a robotics classifier predicted the work was relevant * `is_ai` - True if an artificial intelligence classifier predicted the work was relevant, or if any of the computer vision, natural language processing, or robotics classifiers predicted the work was relevant -* `is_ai_safety` - True if the artificial intelligence classifier predicted the work was relevant to AI, and a separate AI safety classifier also predicted the work was relevant to AI safety +* `is_cyber` - True if a cybersecurity classifier predicted the work was relevant ## Updating the dataset diff --git a/cset_openalex_augmentation_dag.py b/cset_openalex_augmentation_dag.py index f288c2c..7dd2631 100644 --- a/cset_openalex_augmentation_dag.py +++ b/cset_openalex_augmentation_dag.py @@ -51,7 +51,7 @@ run_dir = "current_run" public_bucket = "mos-static" table = "metadata" - gce_resource_id = 
"cset-openalex-updates" + gce_resource_id = "cset-openalex-updater" clear_tmp_dir = GCSDeleteObjectsOperator( task_id="clear_tmp_dir", bucket_name=DATA_BUCKET, prefix=tmp_dir diff --git a/schemas/metadata.json b/schemas/metadata.json index 367cc56..eb342ee 100644 --- a/schemas/metadata.json +++ b/schemas/metadata.json @@ -14,12 +14,6 @@ "type": "STRING", "description": "Language of abstract, as detected by pycld2" }, - { - "mode": "REPEATED", - "name": "top_level1_fields", - "type": "STRING", - "description": "Top three level 1 fields of study, using method in https://aclanthology.org/2022.sdp-1.12/" - }, { "name": "is_ai", "type": "BOOLEAN", @@ -39,10 +33,5 @@ "name": "is_robotics", "type": "BOOLEAN", "description": "True if a CSET classifier predicted this work was relevant to Robotics, False if the classifier predicted that this work was not relevant to Robotics, and null if the classifier was not run on this work (i.e. if the work had a non-English or null title and abstract)." - }, - { - "name": "is_ai_safety", - "type": "BOOLEAN", - "description": "True if a CSET classifier predicted this work was relevant to AI Safety, False if the classifier predicted that this work was not relevant to AI, and null if the classifier was not run on this work. 
This classifier was only run on works that were predicted relevant to AI by the AI classifier (see `is_ai`)" } ] diff --git a/sql/metadata.sql b/sql/metadata.sql index 6e529d5..93542d2 100644 --- a/sql/metadata.sql +++ b/sql/metadata.sql @@ -1,51 +1,51 @@ WITH -field_name_scores AS ( - SELECT - merged_id, - name, - field.score AS score - FROM - fields_of_study_v2.field_scores - CROSS JOIN - UNNEST(fields) AS field - LEFT JOIN - fields_of_study_v2.field_meta - ON - field_id = field.id - WHERE - (level = 1)), - -field_order AS ( - SELECT - merged_id, - name, - score, - ROW_NUMBER() OVER(PARTITION BY merged_id ORDER BY score DESC) AS row_num - FROM - field_name_scores), - -top_fields AS ( - SELECT - merged_id, - ARRAY_AGG(name ORDER BY score DESC) AS top_level1_fields - FROM - field_order - WHERE - ( - row_num < 4 - ) AND ( - merged_id IN ( - SELECT merged_id - FROM - literature.papers - WHERE - ( - title_english IS NOT NULL - ) AND (abstract_english IS NOT NULL) AND (LENGTH(abstract_english) > 500) AND (year > 2010) - ) - ) - GROUP BY merged_id -), +-- field_name_scores AS ( +-- SELECT +-- merged_id, +-- name, +-- field.score AS score +-- FROM +-- fields_of_study_v2.field_scores +-- CROSS JOIN +-- UNNEST(fields) AS field +-- LEFT JOIN +-- fields_of_study_v2.field_meta +-- ON +-- field_id = field.id +-- WHERE +-- (level = 1)), +-- +-- field_order AS ( +-- SELECT +-- merged_id, +-- name, +-- score, +-- ROW_NUMBER() OVER(PARTITION BY merged_id ORDER BY score DESC) AS row_num +-- FROM +-- field_name_scores), +-- +-- top_fields AS ( +-- SELECT +-- merged_id, +-- ARRAY_AGG(name ORDER BY score DESC) AS top_level1_fields +-- FROM +-- field_order +-- WHERE +-- ( +-- row_num < 4 +-- ) AND ( +-- merged_id IN ( +-- SELECT merged_id +-- FROM +-- literature.papers +-- WHERE +-- ( +-- title_english IS NOT NULL +-- ) AND (abstract_english IS NOT NULL) AND (LENGTH(abstract_english) > 500) AND (year > 2010) +-- ) +-- ) +-- GROUP BY merged_id +-- ), ai_pubs AS ( SELECT @@ -53,31 +53,29 
@@ ai_pubs AS ( ai OR nlp OR cv OR robotics AS is_ai, nlp AS is_nlp, cv AS is_cv, - robotics AS is_robotics + robotics AS is_robotics, + cyber AS is_cyber FROM openalex_article_classification.predictions - WHERE - ai IS TRUE - OR nlp IS TRUE - OR cv IS TRUE - OR robotics IS TRUE ), -ai_safety_pubs AS ( - SELECT - orig_id, - preds_str AS is_ai_safety - FROM - ai_safety_openalex.ai_safety_predictions -), +-- ai_safety_pubs AS ( +-- SELECT +-- orig_id, +-- preds_str AS is_ai_safety +-- FROM +-- ai_safety_openalex.ai_safety_predictions +-- ), language_id AS ( SELECT DISTINCT id, - IF(title_cld2_lid_success AND title_cld2_lid_is_reliable, title_cld2_lid_first_result, NULL) AS title_language, - IF( + LOWER( + IF(title_cld2_lid_success AND title_cld2_lid_is_reliable, title_cld2_lid_first_result, NULL) + ) AS title_language, + LOWER(IF( abstract_cld2_lid_success AND abstract_cld2_lid_is_reliable, abstract_cld2_lid_first_result, NULL - ) AS abstract_language + )) AS abstract_language FROM staging_literature.all_metadata_with_cld2_lid ) @@ -86,26 +84,24 @@ SELECT id, title_language, abstract_language, - top_level1_fields, + -- top_level1_fields, is_ai, is_nlp, is_cv, is_robotics, - is_ai_safety + is_cyber +-- is_ai_safety FROM openalex.works -INNER JOIN - literature.sources - ON id = orig_id -LEFT JOIN - top_fields - USING (merged_id) +-- LEFT JOIN +-- top_fields +-- USING (merged_id) LEFT JOIN ai_pubs - ON id = orig_id -LEFT JOIN - ai_safety_pubs - ON id = orig_id + ON id = ai_pubs.orig_id +-- LEFT JOIN +-- ai_safety_pubs +-- ON id = orig_id LEFT JOIN language_id USING (id)