Skip to content

Commit

Permalink
Remove metadata we're holding until v2
Browse files Browse the repository at this point in the history
  • Loading branch information
jmelot committed Apr 22, 2024
1 parent a7784df commit 1d703f6
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 95 deletions.
9 changes: 1 addition & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,6 @@ Our article linkage pipeline [generates language ID labels](https://github.com/g
for titles and abstracts using [PYCLD2](https://pypi.org/project/pycld2/). We only include language IDs where PYCLD2
successfully output a language and marked the output as reliable.

### Fields of Study

We include the top three level 1 field of study labels generated using the method described in
[Multi-label Classification of Scientific Research Documents Across Domains and Languages](https://aclanthology.org/2022.sdp-1.12) (Toney & Dunham, sdp 2022).
We only include these labels for records published after 2010 with a non-null English title and abstract (as detected by PYCLD2), where the
abstract is over 500 characters in length.

### Subject relevance predictions

We share outputs for subject classifiers (for more information on how these classifiers were trained
Expand All @@ -31,7 +24,7 @@ in the following fields:
* `is_nlp` - True if a natural language processing classifier predicted the work was relevant
* `is_robotics` - True if a robotics classifier predicted the work was relevant
* `is_ai` - True if an artificial intelligence classifier predicted the work was relevant, or if any of the computer vision, natural language processing, or robotics classifiers predicted the work was relevant
* `is_ai_safety` - True if the artificial intelligence classifier predicted the work was relevant to AI, and a separate AI safety classifier also predicted the work was relevant to AI safety
* `is_cyber` - True if a cybersecurity classifier predicted the work was relevant

## Updating the dataset

Expand Down
2 changes: 1 addition & 1 deletion cset_openalex_augmentation_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
run_dir = "current_run"
public_bucket = "mos-static"
table = "metadata"
gce_resource_id = "cset-openalex-updates"
gce_resource_id = "cset-openalex-updater"

clear_tmp_dir = GCSDeleteObjectsOperator(
task_id="clear_tmp_dir", bucket_name=DATA_BUCKET, prefix=tmp_dir
Expand Down
11 changes: 0 additions & 11 deletions schemas/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,6 @@
"type": "STRING",
"description": "Language of abstract, as detected by pycld2"
},
{
"mode": "REPEATED",
"name": "top_level1_fields",
"type": "STRING",
"description": "Top three level 1 fields of study, using method in https://aclanthology.org/2022.sdp-1.12/"
},
{
"name": "is_ai",
"type": "BOOLEAN",
Expand All @@ -39,10 +33,5 @@
"name": "is_robotics",
"type": "BOOLEAN",
"description": "True if a CSET classifier predicted this work was relevant to Robotics, False if the classifier predicted that this work was not relevant to Robotics, and null if the classifier was not run on this work (i.e. if the work had a non-English or null title and abstract)."
},
{
"name": "is_ai_safety",
"type": "BOOLEAN",
"description": "True if a CSET classifier predicted this work was relevant to AI Safety, False if the classifier predicted that this work was not relevant to AI Safety, and null if the classifier was not run on this work. This classifier was only run on works that were predicted relevant to AI by the AI classifier (see `is_ai`)."
}
]
146 changes: 71 additions & 75 deletions sql/metadata.sql
Original file line number Diff line number Diff line change
@@ -1,83 +1,81 @@
WITH
field_name_scores AS (
SELECT
merged_id,
name,
field.score AS score
FROM
fields_of_study_v2.field_scores
CROSS JOIN
UNNEST(fields) AS field
LEFT JOIN
fields_of_study_v2.field_meta
ON
field_id = field.id
WHERE
(level = 1)),

field_order AS (
SELECT
merged_id,
name,
score,
ROW_NUMBER() OVER(PARTITION BY merged_id ORDER BY score DESC) AS row_num
FROM
field_name_scores),

top_fields AS (
SELECT
merged_id,
ARRAY_AGG(name ORDER BY score DESC) AS top_level1_fields
FROM
field_order
WHERE
(
row_num < 4
) AND (
merged_id IN (
SELECT merged_id
FROM
literature.papers
WHERE
(
title_english IS NOT NULL
) AND (abstract_english IS NOT NULL) AND (LENGTH(abstract_english) > 500) AND (year > 2010)
)
)
GROUP BY merged_id
),
-- field_name_scores AS (
-- SELECT
-- merged_id,
-- name,
-- field.score AS score
-- FROM
-- fields_of_study_v2.field_scores
-- CROSS JOIN
-- UNNEST(fields) AS field
-- LEFT JOIN
-- fields_of_study_v2.field_meta
-- ON
-- field_id = field.id
-- WHERE
-- (level = 1)),
--
-- field_order AS (
-- SELECT
-- merged_id,
-- name,
-- score,
-- ROW_NUMBER() OVER(PARTITION BY merged_id ORDER BY score DESC) AS row_num
-- FROM
-- field_name_scores),
--
-- top_fields AS (
-- SELECT
-- merged_id,
-- ARRAY_AGG(name ORDER BY score DESC) AS top_level1_fields
-- FROM
-- field_order
-- WHERE
-- (
-- row_num < 4
-- ) AND (
-- merged_id IN (
-- SELECT merged_id
-- FROM
-- literature.papers
-- WHERE
-- (
-- title_english IS NOT NULL
-- ) AND (abstract_english IS NOT NULL) AND (LENGTH(abstract_english) > 500) AND (year > 2010)
-- )
-- )
-- GROUP BY merged_id
-- ),

ai_pubs AS (
SELECT
orig_id,
ai OR nlp OR cv OR robotics AS is_ai,
nlp AS is_nlp,
cv AS is_cv,
robotics AS is_robotics
robotics AS is_robotics,
cyber AS is_cyber
FROM
openalex_article_classification.predictions
WHERE
ai IS TRUE
OR nlp IS TRUE
OR cv IS TRUE
OR robotics IS TRUE
),

ai_safety_pubs AS (
SELECT
orig_id,
preds_str AS is_ai_safety
FROM
ai_safety_openalex.ai_safety_predictions
),
-- ai_safety_pubs AS (
-- SELECT
-- orig_id,
-- preds_str AS is_ai_safety
-- FROM
-- ai_safety_openalex.ai_safety_predictions
-- ),

language_id AS (
SELECT DISTINCT
id,
IF(title_cld2_lid_success AND title_cld2_lid_is_reliable, title_cld2_lid_first_result, NULL) AS title_language,
IF(
LOWER(
IF(title_cld2_lid_success AND title_cld2_lid_is_reliable, title_cld2_lid_first_result, NULL)
) AS title_language,
LOWER(IF(
abstract_cld2_lid_success AND abstract_cld2_lid_is_reliable, abstract_cld2_lid_first_result, NULL
) AS abstract_language
)) AS abstract_language
FROM
staging_literature.all_metadata_with_cld2_lid
)
Expand All @@ -86,26 +84,24 @@ SELECT
id,
title_language,
abstract_language,
top_level1_fields,
-- top_level1_fields,
is_ai,
is_nlp,
is_cv,
is_robotics,
is_ai_safety
is_cyber
-- is_ai_safety
FROM
openalex.works
INNER JOIN
literature.sources
ON id = orig_id
LEFT JOIN
top_fields
USING (merged_id)
-- LEFT JOIN
-- top_fields
-- USING (merged_id)
LEFT JOIN
ai_pubs
ON id = orig_id
LEFT JOIN
ai_safety_pubs
ON id = orig_id
ON id = ai_pubs.orig_id
-- LEFT JOIN
-- ai_safety_pubs
-- ON id = orig_id
LEFT JOIN
language_id
USING (id)

0 comments on commit 1d703f6

Please sign in to comment.