From 319cebea708e4e18e0566f751246589719e31ce8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Nov 2024 01:13:42 +0000 Subject: [PATCH 01/46] Bump huggingface-hub from 0.26.0 to 0.26.2 Bumps [huggingface-hub](https://github.com/huggingface/huggingface_hub) from 0.26.0 to 0.26.2. - [Release notes](https://github.com/huggingface/huggingface_hub/releases) - [Commits](https://github.com/huggingface/huggingface_hub/compare/v0.26.0...v0.26.2) --- updated-dependencies: - dependency-name: huggingface-hub dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3202f1c1..fd6a9a94 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,7 @@ ftfy==6.1.3 future==0.18.3 gdown==5.2.0 gensim==4.3.3 -huggingface-hub==0.26.0 +huggingface-hub==0.26.2 hyperopt==0.2.7 idna==3.7 imageio==2.35.1 From 9e375aa3c00b8a26b300a8a33256f41d19d41962 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Nov 2024 01:13:47 +0000 Subject: [PATCH 02/46] Bump networkx from 3.2.1 to 3.4.2 Bumps [networkx](https://github.com/networkx/networkx) from 3.2.1 to 3.4.2. - [Release notes](https://github.com/networkx/networkx/releases) - [Commits](https://github.com/networkx/networkx/compare/networkx-3.2.1...networkx-3.4.2) --- updated-dependencies: - dependency-name: networkx dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3202f1c1..250a1360 100644 --- a/requirements.txt +++ b/requirements.txt @@ -36,7 +36,7 @@ matplotlib==3.9.2 more-itertools==10.5.0 mpld3==0.5.10 mpmath==1.3.0 -networkx==3.2.1 +networkx==3.4.2 numpy==1.26.2 packaging==24.1 pandas==2.2.3 From e80546e2d3849c9d4807b9fbf1b2e976f58c5422 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Nov 2024 14:34:45 +0000 Subject: [PATCH 03/46] Bump idna from 3.7 to 3.10 Bumps [idna](https://github.com/kjd/idna) from 3.7 to 3.10. - [Release notes](https://github.com/kjd/idna/releases) - [Changelog](https://github.com/kjd/idna/blob/master/HISTORY.rst) - [Commits](https://github.com/kjd/idna/compare/v3.7...v3.10) --- updated-dependencies: - dependency-name: idna dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 589e5086..ee520a46 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,7 +20,7 @@ gdown==5.2.0 gensim==4.3.3 huggingface-hub==0.26.2 hyperopt==0.2.7 -idna==3.7 +idna==3.10 imageio==2.35.1 Janome==0.5.0 Jinja2==3.1.4 From aa9ce4c673314dfdab242d12719d1e413d163d55 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 25 Nov 2024 01:19:06 +0000 Subject: [PATCH 04/46] Bump soupsieve from 2.5 to 2.6 Bumps [soupsieve](https://github.com/facelessuser/soupsieve) from 2.5 to 2.6. - [Release notes](https://github.com/facelessuser/soupsieve/releases) - [Commits](https://github.com/facelessuser/soupsieve/compare/2.5...2.6) --- updated-dependencies: - dependency-name: soupsieve dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 589e5086..a08cd09d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -63,7 +63,7 @@ semver==3.0.2 sentencepiece==0.1.99 six==1.16.0 smart-open==7.0.4 -soupsieve==2.5 +soupsieve==2.6 sqlitedict==2.1.0 sympy==1.13.3 tabulate==0.9.0 From 69965e97453dcdb2f7c1261119a13ea0e1d58882 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 02:00:05 +0000 Subject: [PATCH 05/46] Bump tokenizers from 0.20.1 to 0.21.0 Bumps [tokenizers](https://github.com/huggingface/tokenizers) from 0.20.1 to 0.21.0. - [Release notes](https://github.com/huggingface/tokenizers/releases) - [Changelog](https://github.com/huggingface/tokenizers/blob/main/RELEASE.md) - [Commits](https://github.com/huggingface/tokenizers/compare/v0.20.1...v0.21.0) --- updated-dependencies: - dependency-name: tokenizers dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 589e5086..c709208e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -69,7 +69,7 @@ sympy==1.13.3 tabulate==0.9.0 threadpoolctl==3.2.0 tifffile==2024.9.20 -tokenizers==0.20.1 +tokenizers==0.21.0 torch==2.5.0 torchvision==0.20.0 tqdm==4.66.5 From 4c6bb3a2c6f48b01900558542eec311c07ede13d Mon Sep 17 00:00:00 2001 From: davidycliao Date: Mon, 2 Dec 2024 17:23:22 +0000 Subject: [PATCH 06/46] UPDATE --- .Renviron | 2 +- .gitignore | 1 + DESCRIPTION | 1 + README.Rmd | 14 ++++++++++---- README.md | 8 ++++++++ vignettes/test.Rmd | 19 +++++++++++++++++++ 6 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 vignettes/test.Rmd diff --git a/.Renviron b/.Renviron index 383154df..dfc6d026 100644 --- a/.Renviron +++ b/.Renviron @@ -1 +1 @@ -_R_CHECK_FORCE_SUGGESTS_=false +_R_CHECK_FORCE_SUGGESTS_=TRUE diff --git a/.gitignore b/.gitignore index 1c393a6e..2256e6d5 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ files/directories ^tmp.* vignettes/vignettes/inst classifier +/R/note.R diff --git a/DESCRIPTION b/DESCRIPTION index ac0b56ad..e99619f7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,6 +36,7 @@ Suggests: purrr, jsonlite, ggplot2, + plotly, testthat (>= 3.0.0) RoxygenNote: 7.3.2 VignetteBuilder: knitr, rmarkdown diff --git a/README.Rmd b/README.Rmd index dbd3a613..80e41c73 100644 --- a/README.Rmd +++ b/README.Rmd @@ -25,7 +25,7 @@ knitr::opts_chunk$set( ```{r include=FALSE} library(reticulate) -library(flaiR) +# library(flaiR) reticulate::py_install("flair") system(paste(reticulate::py_config()$python, "-m pip install flair")) # options(repos = c(CRAN = "https://cloud.r-project.org/")) @@ -75,8 +75,17 @@ __flaiR__ runs the Flair NLP backend in Python, thus requiring Python installati
+ + +## Updates and News + +- [Tutorial for embeddings in flaiR](https://davidycliao.github.io/flaiR/articles/tutorial.html#embedding) + + +
+ ## Contribution and Open Source @@ -93,6 +102,3 @@ The primary communication channel for R users can be found [here](https://github
- - - diff --git a/README.md b/README.md index 621b3100..7fb284ae 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ remotes::install_github("davidycliao/flaiR", force = TRUE) ``` r library(flaiR) +#> flaiR: An R Wrapper for Accessing Flair NLP 0.14.0 ```
@@ -71,6 +72,13 @@ these specific versions. +## Updates and News + +- [Tutorial for embeddings in + flaiR](https://davidycliao.github.io/flaiR/articles/tutorial.html#embedding) + +
+ ## Contribution and Open Source
diff --git a/vignettes/test.Rmd b/vignettes/test.Rmd new file mode 100644 index 00000000..fc228e3d --- /dev/null +++ b/vignettes/test.Rmd @@ -0,0 +1,19 @@ +--- +title: "R Notebook" +output: html_notebook +--- + +This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. + +Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. + +```{r} +plot(cars) +``` + +Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*. + +When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Cmd+Shift+K* to preview the HTML file). + +The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed. + From 36e17e0b2bfaa71fefeebbd25be8c82dfbd667ff Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 01:56:02 +0000 Subject: [PATCH 07/46] Bump torchvision from 0.20.0 to 0.20.1 Bumps [torchvision](https://github.com/pytorch/vision) from 0.20.0 to 0.20.1. - [Release notes](https://github.com/pytorch/vision/releases) - [Commits](https://github.com/pytorch/vision/compare/v0.20.0...v0.20.1) --- updated-dependencies: - dependency-name: torchvision dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b5263f8b..f44d9e88 100644 --- a/requirements.txt +++ b/requirements.txt @@ -71,7 +71,7 @@ threadpoolctl==3.2.0 tifffile==2024.9.20 tokenizers==0.21.0 torch==2.5.0 -torchvision==0.20.0 +torchvision==0.20.1 tqdm==4.66.5 transformer-smaller-training-vocab==0.4.0 transformers==4.41.1 From ec99bed8573c5f2ef16ba27970a61d8a9e330ef9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 01:57:30 +0000 Subject: [PATCH 08/46] Bump accelerate from 0.28.0 to 1.2.0 Bumps [accelerate](https://github.com/huggingface/accelerate) from 0.28.0 to 1.2.0. - [Release notes](https://github.com/huggingface/accelerate/releases) - [Commits](https://github.com/huggingface/accelerate/compare/v0.28.0...v1.2.0) --- updated-dependencies: - dependency-name: accelerate dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b5263f8b..32741131 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -accelerate==0.28.0 +accelerate==1.2.0 beautifulsoup4==4.12.3 boto3==1.35.23 botocore==1.35.44 From 47a0e3ee3ee9686b9f14fc16dec79bf019eccdf0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:25:50 +0000 Subject: [PATCH 09/46] Bump torch from 2.5.0 to 2.5.1 Bumps [torch](https://github.com/pytorch/pytorch) from 2.5.0 to 2.5.1. 
- [Release notes](https://github.com/pytorch/pytorch/releases) - [Changelog](https://github.com/pytorch/pytorch/blob/main/RELEASE.md) - [Commits](https://github.com/pytorch/pytorch/compare/v2.5.0...v2.5.1) --- updated-dependencies: - dependency-name: torch dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9eb5f59c..e868f732 100644 --- a/requirements.txt +++ b/requirements.txt @@ -70,7 +70,7 @@ tabulate==0.9.0 threadpoolctl==3.2.0 tifffile==2024.9.20 tokenizers==0.21.0 -torch==2.5.0 +torch==2.5.1 torchvision==0.20.1 tqdm==4.66.5 transformer-smaller-training-vocab==0.4.0 From 68a4f71020c3f60b86906b152bb57d50d5420749 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 01:51:17 +0000 Subject: [PATCH 10/46] Bump numpy from 1.26.2 to 2.2.0 Bumps [numpy](https://github.com/numpy/numpy) from 1.26.2 to 2.2.0. - [Release notes](https://github.com/numpy/numpy/releases) - [Changelog](https://github.com/numpy/numpy/blob/main/doc/RELEASE_WALKTHROUGH.rst) - [Commits](https://github.com/numpy/numpy/compare/v1.26.2...v2.2.0) --- updated-dependencies: - dependency-name: numpy dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 487372a4..48d31411 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,7 +37,7 @@ more-itertools==10.5.0 mpld3==0.5.10 mpmath==1.3.0 networkx==3.4.2 -numpy==1.26.2 +numpy==2.2.0 packaging==24.1 pandas==2.2.3 Pillow==10.2.0 From 03ff4a5e5823903b09ecacaf2877db48a4abbe6a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 01:51:20 +0000 Subject: [PATCH 11/46] Bump wikipedia-api from 0.6.0 to 0.7.1 Bumps [wikipedia-api](https://github.com/martin-majlis/Wikipedia-API) from 0.6.0 to 0.7.1. - [Release notes](https://github.com/martin-majlis/Wikipedia-API/releases) - [Changelog](https://github.com/martin-majlis/Wikipedia-API/blob/master/CHANGES.rst) - [Commits](https://github.com/martin-majlis/Wikipedia-API/compare/v0.6.0...v0.7.1) --- updated-dependencies: - dependency-name: wikipedia-api dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 487372a4..3924032f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -79,5 +79,5 @@ typing_extensions==4.9.0 tzdata==2024.2 urllib3==2.2.2 wcwidth==0.2.13 -Wikipedia-API==0.6.0 +Wikipedia-API==0.7.1 wrapt==1.16.0 From 215732193b499e79c624b5bfe8baaa6ef4fe6bef Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 01:51:23 +0000 Subject: [PATCH 12/46] Bump lazy-loader from 0.3 to 0.4 Bumps [lazy-loader](https://github.com/scientific-python/lazy_loader) from 0.3 to 0.4. 
- [Release notes](https://github.com/scientific-python/lazy_loader/releases) - [Changelog](https://github.com/scientific-python/lazy-loader/blob/main/CHANGELOG.md) - [Commits](https://github.com/scientific-python/lazy_loader/compare/v0.3...v0.4) --- updated-dependencies: - dependency-name: lazy-loader dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 487372a4..05c94923 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,7 +28,7 @@ jmespath==1.0.1 joblib==1.3.2 kiwisolver==1.4.7 langdetect==1.0.9 -lazy_loader==0.3 +lazy_loader==0.4 lime==0.2.0.1 lxml==4.9.3 MarkupSafe==2.1.3 From f476a4bc5e7f4195a76dd6b54bb5e83c72d8e91b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Dec 2024 02:00:46 +0000 Subject: [PATCH 13/46] Bump markupsafe from 2.1.3 to 3.0.2 Bumps [markupsafe](https://github.com/pallets/markupsafe) from 2.1.3 to 3.0.2. - [Release notes](https://github.com/pallets/markupsafe/releases) - [Changelog](https://github.com/pallets/markupsafe/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/markupsafe/compare/2.1.3...3.0.2) --- updated-dependencies: - dependency-name: markupsafe dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 48d31411..6eca23d2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,7 +31,7 @@ langdetect==1.0.9 lazy_loader==0.3 lime==0.2.0.1 lxml==4.9.3 -MarkupSafe==2.1.3 +MarkupSafe==3.0.2 matplotlib==3.9.2 more-itertools==10.5.0 mpld3==0.5.10 From 400ae1df9335b11005fab4cc60b4e70709fb74a9 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 04:45:29 +0000 Subject: [PATCH 14/46] UPLOAD 0.0.7 check news --- DESCRIPTION | 2 +- NAMESPACE | 10 +- NEWS.md | 21 + R/flaiR_loader.R | 164 ---- R/flair_loaders.R | 309 ++++++++ R/get_entities.R | 351 +++++++-- R/get_entities_batch.R | 273 +++---- R/get_pos.R | 6 +- R/get_pos_batch.R | 258 +++---- R/get_sentiments.R | 214 +++--- R/get_sentiments_batch.R | 266 +++---- R/note.R | 354 +++++++++ R/predict_label.R | 88 +++ R/process_embeddings.R | 144 ++++ README.Rmd | 7 +- README.md | 6 +- _pkgdown.yml | 66 +- man/check_tagger.Rd | 19 + man/get_entities.Rd | 162 ++-- man/get_entities_batch.Rd | 76 -- man/get_pos_batch.Rd | 86 --- man/get_sentiments.Rd | 71 -- man/get_sentiments_batch.Rd | 97 --- man/get_tagger_tags.Rd | 65 ++ man/load_tagger_ner.Rd | 39 +- man/load_tagger_pos.Rd | 15 +- man/load_tagger_sentiments.Rd | 30 - man/predict_label.Rd | 59 ++ man/process_embeddings.Rd | 53 ++ renv.lock | 46 +- renv/activate.R | 105 ++- vignettes/get_entities.Rmd | 25 - vignettes/get_pos.Rmd | 20 - vignettes/get_sentiments.Rmd | 122 --- vignettes/test.Rmd | 19 - vignettes/tutorial.Rmd | 1345 +++++++++++++++++++++++++++++---- 36 files changed, 3408 insertions(+), 1585 deletions(-) delete mode 100644 R/flaiR_loader.R create mode 100644 R/flair_loaders.R create mode 100644 R/predict_label.R create mode 100644 R/process_embeddings.R create mode 100644 man/check_tagger.Rd delete mode 100644 man/get_entities_batch.Rd delete mode 100644 man/get_pos_batch.Rd delete mode 100644 
man/get_sentiments.Rd delete mode 100644 man/get_sentiments_batch.Rd create mode 100644 man/get_tagger_tags.Rd delete mode 100644 man/load_tagger_sentiments.Rd create mode 100644 man/predict_label.Rd create mode 100644 man/process_embeddings.Rd delete mode 100644 vignettes/get_sentiments.Rmd delete mode 100644 vignettes/test.Rmd diff --git a/DESCRIPTION b/DESCRIPTION index e99619f7..f5cd1cb6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: flaiR Title: An R Wrapper for Accessing FLAIR -Version: 0.0.6 +Version: 0.0.7 Authors@R: c( person("Yen-Chieh", "Liao", email = "davidyclaio@gmail.com", role = c("cre","aut", "ctb")), person("Stefan", "Müller", email = "stefan.mueller@ucd.ie", role = c("aut", "ctb")), diff --git a/NAMESPACE b/NAMESPACE index 733305bc..47a73ecd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,18 +22,16 @@ export(flair_splitter) export(flair_splitter.SegtokSentenceSplitter) export(flair_trainers) export(get_entities) -export(get_entities_batch) export(get_pos) -export(get_pos_batch) -export(get_sentiments) -export(get_sentiments_batch) +export(get_tagger_tags) export(highlight_text) export(import_flair) export(install_python_package) export(load_tagger_ner) export(load_tagger_pos) -export(load_tagger_sentiments) export(map_entities) +export(predict_label) +export(process_embeddings) export(show_flair_cache) export(uninstall_python_package) import(reticulate) @@ -43,9 +41,7 @@ importFrom(data.table,data.table) importFrom(data.table,rbindlist) importFrom(htmltools,HTML) importFrom(reticulate,import) -importFrom(reticulate,py_available) importFrom(reticulate,py_get_attr) importFrom(reticulate,py_install) importFrom(reticulate,py_module_available) -importFrom(stats,setNames) importFrom(stringr,str_replace_all) diff --git a/NEWS.md b/NEWS.md index b130e799..66f0ecc4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,24 @@ +# flaiR 0.0.7 (2024-12-26) + +
+ +### _Enhancements_ + +- Added new tutorial section for embedding extraction from Flair NLP, focusing on embedding-based regression analysis + +### _Function_ + +- Unified POS tagging functions: Combined get_pos() and get_pos_batch() +- Unified entity recognition: Merged get_entities() and get_entities_batch() +- Removed sentiment analysis functions: Deprecated get_sentiment() and get_sentiment_batch()ve get_sentiment() and get_sentimen_batch. merge get_pos and get_pos_batch& get_enetities and get_enetities_batch. + +
+ + + +
+ + # flaiR 0.0.6 (2023-10-29)
diff --git a/R/flaiR_loader.R b/R/flaiR_loader.R deleted file mode 100644 index 6f427ffd..00000000 --- a/R/flaiR_loader.R +++ /dev/null @@ -1,164 +0,0 @@ -#' @title Load the Named Entity Recognition (NER) Tagger -#' -#' @description A helper function to load the appropriate tagger based on the provided language. -#' This function supports a variety of languages/models. -#' -#' @param language A character string indicating the desired language for the NER tagger. -#' If `NULL`, the function will default to the 'pos-fast' model. -#' Supported languages and their models include: -#' \itemize{ -#' \item `"en"` - English NER tagging (`ner`) -#' \item `"de"` - German NER tagging (`de-ner`) -#' \item `"fr"` - French NER tagging (`fr-ner`) -#' \item `"nl"` - Dutch NER tagging (`nl-ner`) -#' \item `"da"` - Danish NER tagging (`da-ner`) -#' \item `"ar"` - Arabic NER tagging (`ar-ner`) -#' \item `"ner-fast"` - English NER fast model (`ner-fast`) -#' \item `"ner-large"` - English NER large mode (`ner-large`) -#' \item `"de-ner-legal"` - NER (legal text) (`de-ner-legal`) -#' \item `"nl"` - Dutch NER tagging (`nl-ner`) -#' \item `"da"` - Danish NER tagging (`da-ner`) -#' \item `"ar"` - Arabic NER tagging (`ar-ner`) -#'} -#' -#' @return An instance of the Flair SequenceTagger for the specified language. 
-#' -#' @import reticulate -#' @importFrom stats setNames -#' -#' @examples -#' # Load the English NER tagger -#' tagger_en <- load_tagger_ner("en") -#' -#' @export -load_tagger_ner <- function(language = NULL) { - supported_lan_models <- c("ner", "de-ner", - "fr-ner", "nl-ner", - "da-ner", "ar-ner", - "ner-fast", "ner-large", - "ner-pooled", "ner-ontonotes", - "ner-ontonotes-fast", "ner-ontonotes-large", - "de-ner-large", "de-ner-germeval", - "de-ner-legal", "es-ner", - "nl-ner", "nl-ner-large", - "nl-ner-rnn", "ner-ukrainian") - language_model_map <- setNames(supported_lan_models, c("en", "de", - "fr", "nl", - "da", "ar", - "ner-fast", "ner-large", - "ner-pooled", "ner-ontonotes", - "ner-ontonotes-fast", "ner-ontonotes-large", - "de-ner-large", "de-ner-germeval", - "de-ner-legal", "es-ner-large", - "nl-ner", "nl-ner-large", - "nl-ner-rnn", "ner-ukrainian") - ) - - if (is.null(language)) { - language <- "en" - message("Language is not specified. ", language, " in Flair is forceloaded. Please ensure that the internet connectivity is stable.") - } - - # Translate language to model name if necessary - if (language %in% names(language_model_map)) { - language <- language_model_map[[language]] - } - - # Ensure the model is supported - check_language_supported(language = language, supported_lan_models = supported_lan_models) - - # Load the model - SequenceTagger <- reticulate::import("flair.models")$SequenceTagger - SequenceTagger$load(language) -} - -#' @title Load Flair POS Tagger -#' -#' @description This function loads the POS (part-of-speech) tagger model for a specified language -#' using the Flair library. If no language is specified, it defaults to 'pos-fast'. -#' -#' @param language A character string indicating the desired language model. If `NULL`, -#' the function will default to the 'pos-fast' model. 
Supported language models include: -#' \itemize{ -#' \item "pos" - General POS tagging -#' \item "pos-fast" - Faster POS tagging -#' \item "upos" - Universal POS tagging -#' \item "upos-fast" - Faster Universal POS tagging -#' \item "pos-multi" - Multi-language POS tagging -#' \item "pos-multi-fast" - Faster Multi-language POS tagging -#' \item "ar-pos" - Arabic POS tagging -#' \item "de-pos" - German POS tagging -#' \item "de-pos-tweets" - German POS tagging for tweets -#' \item "da-pos" - Danish POS tagging -#' \item "ml-pos" - Malayalam POS tagging -#' \item "ml-upos" - Malayalam Universal POS tagging -#' \item "pt-pos-clinical" - Clinical Portuguese POS tagging -#' \item "pos-ukrainian" - Ukrainian POS tagging -#' } -#' @return A Flair POS tagger model corresponding to the specified (or default) language. -#' -#' @importFrom reticulate import -#' @export -#' @examples -#' \dontrun{ -#' tagger <- load_tagger_pos("pos-fast") -#' } -load_tagger_pos <- function(language = NULL) { - supported_lan_models <- c("pos", "pos-fast", "upos", "upos-fast", - "pos-multi", "pos-multi-fast", "ar-pos", "de-pos", - "de-pos-tweets", "da-pos", "ml-pos", - "ml-upos", "pt-pos-clinical", "pos-ukrainian") - - if (is.null(language)) { - language <- "pos-fast" - message("Language is not specified. ", language, "in Flair is forceloaded. Please ensure that the internet connectivity is stable. \n") - } - - # Ensure the model is supported - check_language_supported(language = language, supported_lan_models = supported_lan_models) - - # Load the model - flair <- reticulate::import("flair") - Classifier <- flair$nn$Classifier - tagger <- Classifier$load(language) -} - -#' @title Load a Sentiment or Language Tagger Model from Flair -#' -#' @description This function loads a pre-trained sentiment or language tagger -#' from the Flair library. -#' -#' @param language A character string specifying the language model to load. 
-#' Supported models include: -#' \itemize{ -#' \item "sentiment" - Sentiment analysis model -#' \item "sentiment-fast" - Faster sentiment analysis model -#' \item "de-offensive-language" - German offensive language detection model -#'} If not provided, the function will default to the "sentiment" model. -#' -#' @return An object of the loaded Flair model. -#' -#' @import reticulate -#' @examples -#' \dontrun{ -#' tagger <- load_tagger_sentiments("sentiment") -#' } -#' -#' @export -load_tagger_sentiments <- function(language = NULL) { - supported_lan_models <- c("sentiment", "sentiment-fast", "de-offensive-language") - - if (is.null(language)) { - language <- "sentiment" - message("Language is not specified. ", language, " in Flair is forceloaded. Please ensure that the internet connectivity is stable.") - } - - # Ensure the model is supported - check_language_supported(language = language, supported_lan_models = supported_lan_models) - - # Load the model - flair <- reticulate::import("flair") - Classifier <- flair$nn$Classifier - tagger <- Classifier$load(language) - return(tagger) -} diff --git a/R/flair_loaders.R b/R/flair_loaders.R new file mode 100644 index 00000000..a85b34ec --- /dev/null +++ b/R/flair_loaders.R @@ -0,0 +1,309 @@ +#' @title Load and Configure NER Tagger +#' +#' @description Loads a Named Entity Recognition model from Flair and displays +#' its tag dictionary. Supports both standard NER and OntoNotes models. +#' +#' @param model_name Character string specifying the model to load. +#' Can be "ner" (default), "flair/ner-english-large", or "flair/ner-english-ontonotes" +#' @param show_tags Logical, whether to display the tag dictionary. +#' Default is TRUE. +#' +#' @return A Flair SequenceTagger model object +#' @export +load_tagger_ner <- function(model_name = "ner", show_tags = TRUE) { + if (is.null(model_name)) { + model_name <- "ner" + message("Model name is not specified. 
Using default 'ner' model.") + } + + # Load the model + tryCatch({ + SequenceTagger <- flair_models()$SequenceTagger + tagger <- SequenceTagger$load(model_name) + + # Extract and organize tags if requested + if (show_tags) { + tag_dict <- tagger$label_dictionary + tag_list <- tag_dict$get_items() + + # Function to extract specific entity types + get_entity_tags <- function(pattern, tags) { + grep(pattern, tags, value = TRUE) + } + + # Group tags by category (supporting both standard and OntoNotes) + categories <- list( + special = get_entity_tags("^<.*>$|^O$", tag_list), + person = get_entity_tags("PERSON", tag_list), + organization = get_entity_tags("ORG", tag_list), + location = get_entity_tags("LOC|GPE", tag_list), + time = get_entity_tags("TIME|DATE", tag_list), + numbers = get_entity_tags("CARDINAL|ORDINAL|PERCENT|MONEY", tag_list), + groups = get_entity_tags("NORP", tag_list), # Nationalities, religious or political groups + facilities = get_entity_tags("FAC", tag_list), # Buildings, airports, highways, bridges + products = get_entity_tags("PRODUCT", tag_list), + events = get_entity_tags("EVENT", tag_list), + art = get_entity_tags("WORK_OF_ART", tag_list), + languages = get_entity_tags("LANGUAGE", tag_list), + laws = get_entity_tags("LAW", tag_list), + misc = get_entity_tags("MISC", tag_list) + ) + + # Print organized output + cat("\nNER Tagger Dictionary:\n") + cat("========================================\n") + cat(sprintf("Total tags: %d\n", length(tag_list))) + cat(sprintf("Model: %s\n", model_name)) + cat("----------------------------------------\n") + + # Print categories with tags + for (cat_name in names(categories)) { + tags <- categories[[cat_name]] + if (length(tags) > 0) { + # Format category name + formatted_name <- gsub("_", " ", tools::toTitleCase(cat_name)) + cat(sprintf("%-15s: %s\n", + formatted_name, + paste(tags, collapse = ", "))) + } + } + + cat("----------------------------------------\n") + cat("Tag scheme: BIOES\n") + cat("B-: 
Beginning of multi-token entity\n") + cat("I-: Inside of multi-token entity\n") + cat("O: Outside (not part of any entity)\n") + cat("E-: End of multi-token entity\n") + cat("S-: Single token entity\n") + cat("========================================\n") + } + + return(tagger) + + }, error = function(e) { + stop(sprintf( + "Error loading model: %s\nPlease check:\n - Model name is correct\n - Internet connection is stable\n - You have sufficient permissions\nError: %s", + model_name, e$message + )) + }) +} + + +#' @title Extract Model Tags +#' +#' @description Helper function to extract and categorize tags from a loaded Flair +#' SequenceTagger model. The tags are grouped into categories such as person, +#' organization, location, and miscellaneous. +#' +#' @param tagger A loaded Flair SequenceTagger model +#' +#' @return A list of tags grouped by category: +#' \describe{ +#' \item{all}{Complete list of all available tags} +#' \item{special}{Special tags like , O, , } +#' \item{person}{Person-related tags (e.g., B-PER, I-PER)} +#' \item{organization}{Organization tags (e.g., B-ORG, E-ORG)} +#' \item{location}{Location tags (e.g., B-LOC, S-LOC)} +#' \item{misc}{Miscellaneous entity tags} +#' } +#' +#' @details +#' The tags follow the BIOES (Begin, Inside, Outside, End, Single) scheme: +#' \itemize{ +#' \item{B-: Beginning of multi-token entity (e.g., B-PER in "John Smith")} +#' \item{I-: Inside of multi-token entity (e.g., I-PER in "John Smith")} +#' \item{O: Outside of any entity} +#' \item{E-: End of multi-token entity} +#' \item{S-: Single token entity (e.g., S-LOC in "Paris")} +#' } +#' +#' @examples +#' \dontrun{ +#' # Load a NER model +#' tagger <- load_tagger_ner("flair/ner-english-large") +#' +#' # Extract all tags +#' tags <- get_tagger_tags(tagger) +#' +#' # Access specific tag categories +#' print(tags$person) # All person-related tags +#' print(tags$location) # All location-related tags +#' +#' # Example usage with text annotation +#' # B-PER I-PER O 
S-ORG +#' # "John Smith works at Google" +#' +#' # B-LOC E-LOC O B-ORG E-ORG +#' # "New York is United Nations headquarters" +#' +#' # Use tags to filter entities +#' person_entities <- results[tag %in% tags$person] +#' org_entities <- results[tag %in% tags$organization] +#' } +#' +#' @seealso +#' \code{\link{load_tagger_ner}} for loading the NER model +#' +#' @export +get_tagger_tags <- function(tagger) { + tag_dict <- tagger$label_dictionary + tag_list <- tag_dict$get_items() + + list( + all = tag_list, + special = grep("^<.*>$|^O$", tag_list, value = TRUE), + person = grep("PER", tag_list, value = TRUE), + organization = grep("ORG", tag_list, value = TRUE), + location = grep("LOC", tag_list, value = TRUE), + misc = grep("MISC", tag_list, value = TRUE) + ) +} + + +#' @title Load Flair POS Tagger +#' +#' @description This function loads the POS (part-of-speech) tagger model for a specified language +#' using the Flair library. If no language is specified, it defaults to 'pos-fast'. +#' +#' @param language A character string indicating the desired language model. If `NULL`, +#' the function will default to the 'pos-fast' model. Supported language models include: +#' \itemize{ +#' \item "pos" - General POS tagging +#' \item "pos-fast" - Faster POS tagging +#' \item "upos" - Universal POS tagging +#' \item "upos-fast" - Faster Universal POS tagging +#' \item "pos-multi" - Multi-language POS tagging +#' \item "pos-multi-fast" - Faster Multi-language POS tagging +#' \item "ar-pos" - Arabic POS tagging +#' \item "de-pos" - German POS tagging +#' \item "de-pos-tweets" - German POS tagging for tweets +#' \item "da-pos" - Danish POS tagging +#' \item "ml-pos" - Malayalam POS tagging +#' \item "ml-upos" - Malayalam Universal POS tagging +#' \item "pt-pos-clinical" - Clinical Portuguese POS tagging +#' \item "pos-ukrainian" - Ukrainian POS tagging +#' } +#' @return A Flair POS tagger model corresponding to the specified (or default) language. 
+#' +#' @importFrom reticulate import +#' @export +#' @examples +#' \dontrun{ +#' tagger <- load_tagger_pos("pos-fast") +#' } +# load_tagger_pos <- function(language = NULL) { +# supported_lan_models <- c("pos", "pos-fast", "upos", "upos-fast", +# "pos-multi", "pos-multi-fast", "ar-pos", "de-pos", +# "de-pos-tweets", "da-pos", "ml-pos", +# "ml-upos", "pt-pos-clinical", "pos-ukrainian") +# +# if (is.null(language)) { +# language <- "pos-fast" +# message("Language is not specified. ", language, "in Flair is forceloaded. Please ensure that the internet connectivity is stable. \n") +# } +# +# # Ensure the model is supported +# check_language_supported(language = language, supported_lan_models = supported_lan_models) +# +# # Load the model +# flair <- reticulate::import("flair") +# Classifier <- flair$nn$Classifier +# tagger <- Classifier$load(language) +# } + +#' @title Load POS (Part-of-Speech) Tagger Model +#' +#' @description Loads a Part-of-Speech tagging model from Flair and displays +#' its tag dictionary in organized categories. +#' +#' @param model_name Character string specifying the model to load. +#' Default is "pos-fast". +#' @param show_tags Logical, whether to display the tag dictionary. +#' Default is TRUE. +#' +#' @return A Flair tagger model object for POS tagging +#' @export +load_tagger_pos <- function(model_name = "pos-fast", show_tags = TRUE) { + if (is.null(model_name)) { + model_name <- "pos-fast" + message("Model name not specified. 
Using default 'pos-fast' model.") + } + + # Load the model + tryCatch({ + flair <- reticulate::import("flair") + Classifier <- flair$nn$Classifier + + message("Loading POS tagger model: ", model_name) + tagger <- Classifier$load(model_name) + + # Display tag dictionary if requested + if (show_tags) { + tag_dict <- tagger$label_dictionary + tag_list <- tag_dict$get_items() + + # Group tags by category + categories <- list( + special = grep("^<.*>$|^O$", tag_list, value = TRUE), # Special tags + noun = grep("^NN|^PRP|^WP|^EX", tag_list, value = TRUE), # Nouns, pronouns + verb = grep("^VB|^MD", tag_list, value = TRUE), # Verbs, modals + adj = grep("^JJ|^POS", tag_list, value = TRUE), # Adjectives + adv = grep("^RB|^WRB", tag_list, value = TRUE), # Adverbs + det = grep("^DT|^WDT|^PDT", tag_list, value = TRUE), # Determiners + prep = grep("^IN|^TO", tag_list, value = TRUE), # Prepositions + conj = grep("^CC", tag_list, value = TRUE), # Conjunctions + num = grep("^CD", tag_list, value = TRUE), # Numbers + punct = grep("^[[:punct:]]|^-[LR]RB-|^HYPH|^NFP", tag_list, value = TRUE), # Punctuation + other = grep("^FW|^SYM|^ADD|^XX|^UH|^LS|^\\$", tag_list, value = TRUE) # Others + ) + + # Print organized output + cat("\nPOS Tagger Dictionary:\n") + cat("========================================\n") + cat(sprintf("Total tags: %d\n", length(tag_list))) + cat("----------------------------------------\n") + + # Print each category + if (length(categories$special) > 0) + cat("Special: ", paste(categories$special, collapse = ", "), "\n") + if (length(categories$noun) > 0) + cat("Nouns: ", paste(categories$noun, collapse = ", "), "\n") + if (length(categories$verb) > 0) + cat("Verbs: ", paste(categories$verb, collapse = ", "), "\n") + if (length(categories$adj) > 0) + cat("Adjectives: ", paste(categories$adj, collapse = ", "), "\n") + if (length(categories$adv) > 0) + cat("Adverbs: ", paste(categories$adv, collapse = ", "), "\n") + if (length(categories$det) > 0) + cat("Determiners: 
", paste(categories$det, collapse = ", "), "\n") + if (length(categories$prep) > 0) + cat("Prepositions: ", paste(categories$prep, collapse = ", "), "\n") + if (length(categories$conj) > 0) + cat("Conjunctions: ", paste(categories$conj, collapse = ", "), "\n") + if (length(categories$num) > 0) + cat("Numbers: ", paste(categories$num, collapse = ", "), "\n") + if (length(categories$punct) > 0) + cat("Punctuation: ", paste(categories$punct, collapse = ", "), "\n") + if (length(categories$other) > 0) + cat("Others: ", paste(categories$other, collapse = ", "), "\n") + + cat("----------------------------------------\n") + cat("Common POS Tag Meanings:\n") + cat("NN*: Nouns (NNP: Proper, NNS: Plural)\n") + cat("VB*: Verbs (VBD: Past, VBG: Gerund)\n") + cat("JJ*: Adjectives (JJR: Comparative)\n") + cat("RB*: Adverbs\n") + cat("PRP: Pronouns, DT: Determiners\n") + cat("IN: Prepositions, CC: Conjunctions\n") + cat("========================================\n") + } + + return(tagger) + + }, error = function(e) { + stop(sprintf( + "Error loading POS model: %s\n Please check:\n - Model name is correct\n - Internet connection is stable\n - You have sufficient permissions\nError: %s", + model_name, e$message + )) + }) +} diff --git a/R/get_entities.R b/R/get_entities.R index bcce0100..3d34f71e 100644 --- a/R/get_entities.R +++ b/R/get_entities.R @@ -1,75 +1,175 @@ -#' @title Tagging Named Entities with Flair Models +#' @title Check if Tagger is Valid #' -#' @description This function takes texts and their corresponding document IDs -#' as inputs, uses the Flair NLP library to extract named entities, -#' and returns a dataframe of the identified entities along with their tags. -#' When no entities are detected in a text, the function returns a data table -#' with NA values. This might clutter the results. Depending on your use case, -#' you might decide to either keep this behavior or skip rows with no detected -#' entities. 
+#' @description Internal function to verify if the provided tagger is valid +#' and has the required methods. +#' +#' @param tagger A Flair tagger object to check +#' @return Logical indicating if the tagger is valid +#' @keywords internal +check_tagger <- function(tagger) { + if (is.null(tagger)) { + stop("Tagger cannot be NULL. Please provide a valid Flair tagger object.", + "\nExample: tagger_ner <- load_tagger_ner('ner')") + } + + # Check if tagger has required methods + required_methods <- c("predict", "to") + has_methods <- sapply(required_methods, function(method) { + !is.null(tagger[[method]]) && is.function(tagger[[method]]) + }) + + if (!all(has_methods)) { + missing_methods <- required_methods[!has_methods] + stop("Invalid tagger object. Missing required methods: ", + paste(missing_methods, collapse = ", ")) + } + + return(TRUE) +} +#' @title Extract Named Entities from Texts with Batch Processing +#' +#' @description This function processes texts in batches and extracts named entities +#' using the Flair NLP library. It supports both standard NER and OntoNotes models, +#' with options for batch processing and GPU acceleration. #' #' @param texts A character vector containing the texts to process. #' @param doc_ids A character or numeric vector containing the document IDs #' corresponding to each text. -#' @param tagger An optional tagger object. If NULL (default), the function will -#' load a Flair tagger based on the specified language. -#' @param language A character string indicating the language model to load. -#' Default is "en". +#' @param tagger A Flair tagger object for named entity recognition. Must be provided +#' by the user. 
Can be created using load_tagger_ner() with different models: +#' \itemize{ +#' \item Standard NER: tagger_ner <- load_tagger_ner('ner') +#' \item OntoNotes: tagger_ner <- load_tagger_ner('flair/ner-english-ontonotes') +#' \item Large model: tagger_ner <- load_tagger_ner('flair/ner-english-large') +#' } #' @param show.text_id A logical value. If TRUE, includes the actual text from -#' which the entity was extracted in the resulting data table. Useful for -#' verification and traceability purposes but might increase the size of -#' the output. Default is FALSE. +#' which the entity was extracted. Default is FALSE. #' @param gc.active A logical value. If TRUE, runs the garbage collector after -#' processing all texts. This can help in freeing up memory by releasing unused -#' memory space, especially when processing a large number of texts. -#' Default is FALSE. +#' processing texts. Default is FALSE. +#' @param batch_size An integer specifying the size of each batch. Set to 1 for +#' single-text processing. Default is 5. +#' @param device A character string specifying the computation device ("cpu", +#' "cuda:0", "cuda:1", etc.). Default is "cpu". Note: MPS (Mac M1/M2) is currently +#' not fully supported and will default to CPU. +#' @param verbose A logical value. If TRUE, prints processing progress. Default is FALSE. +#' #' @return A data table with columns: #' \describe{ -#' \item{doc_id}{The ID of the document from which the entity was extracted.} -#' \item{text_id}{If TRUE, the actual text from which the entity -#' was extracted.} -#' \item{entity}{The named entity that was extracted from the text.} -#' \item{tag}{The tag or category of the named entity. 
Common tags include: -#' PERSON (names of individuals), -#' ORG (organizations, institutions), -#' GPE (countries, cities, states), -#' LOCATION (mountain ranges, bodies of water), -#' DATE (dates or periods), -#' TIME (times of day), -#' MONEY (monetary values), -#' PERCENT (percentage values), -#' FACILITY (buildings, airports), -#' PRODUCT (objects, vehicles), -#' EVENT (named events like wars or sports events), -#' ART (titles of books)}} +#' \item{doc_id}{Character or numeric. The ID of the document from which the +#' entity was extracted.} +#' \item{text_id}{Character. The complete text from which the entity was +#' extracted. Only included when show.text_id = TRUE.} +#' \item{entity}{Character. The actual named entity text that was extracted. +#' Will be NA if no entity was found.} +#' \item{tag}{Character. The category of the named entity. Available tags depend on +#' the model used: +#' \itemize{ +#' \item{Standard NER tags:} +#' \itemize{ +#' \item{PERSON: Names of people} +#' \item{ORG: Organizations} +#' \item{LOC: Locations} +#' \item{MISC: Miscellaneous entities} +#' } +#' \item{OntoNotes tags:} +#' \itemize{ +#' \item{PERSON: People, including fictional characters} +#' \item{ORG: Companies, agencies, institutions} +#' \item{GPE: Countries, cities, states} +#' \item{LOC: Non-GPE locations, mountains, water bodies} +#' \item{DATE: Absolute or relative dates} +#' \item{TIME: Times of day} +#' \item{MONEY: Monetary values} +#' \item{PERCENT: Percentage values} +#' \item{CARDINAL: Numerals} +#' \item{ORDINAL: Ordinal numbers} +#' \item{NORP: Nationalities, religious, or political groups} +#' \item{FAC: Buildings, airports, highways, bridges} +#' \item{WORK_OF_ART: Titles of books, songs, etc.} +#' \item{LAW: Named documents made into laws} +#' \item{LANGUAGE: Named languages} +#' } +#' }} +#' } +#' +#' @section Tag Format: +#' All tags use the BIOES (Begin, Inside, Outside, End, Single) scheme: +#' \itemize{ +#' \item{B-: Beginning of multi-token entity 
(e.g., B-PERSON in "John Smith")} +#' \item{I-: Inside of multi-token entity} +#' \item{O: Outside (not part of any entity)} +#' \item{E-: End of multi-token entity} +#' \item{S-: Single token entity (e.g., S-LOC in "Paris")} +#' } +#' #' @examples #' \dontrun{ #' library(reticulate) #' library(fliaR) #' -#' texts <- c("UCD is one of the best universities in Ireland.", -#' "UCD has a good campus but is very far from -#' my apartment in Dublin.", -#' "Essex is famous for social science research.", -#' "Essex is not in the Russell Group, but it is -#' famous for political science research.", -#' "TCD is the oldest university in Ireland.", -#' "TCD is similar to Oxford.") -#' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") -#' # Load NER ("ner") model -#' tagger_ner <- load_tagger_ner('ner') -#' results <- get_entities(texts, doc_ids, tagger_ner) -#' print(results)} +#' # Using standard NER model +#' tagger_std <- load_tagger_ner('ner') +#' +#' # Using OntoNotes model +#' tagger_onto <- load_tagger_ner('flair/ner-english-ontonotes') +#' +#' texts <- c( +#' "John Smith works at Google in New York.", +#' "The Eiffel Tower was built in 1889." 
+#' ) +#' doc_ids <- c("doc1", "doc2") +#' +#' # Process with standard NER +#' results_std <- get_entities( +#' texts = texts, +#' doc_ids = doc_ids, +#' tagger = tagger_std, +#' batch_size = 2, +#' verbose = TRUE +#' ) +#' +#' # Process with OntoNotes model +#' results_onto <- get_entities( +#' texts = texts, +#' doc_ids = doc_ids, +#' tagger = tagger_onto, +#' batch_size = 2, +#' verbose = TRUE +#' ) +#' +#' # Filter specific entity types +#' persons <- results_onto[grepl("PERSON", tag)] +#' locations <- results_onto[grepl("LOC|GPE", tag)] +#' dates <- results_onto[grepl("DATE", tag)] +#' } #' #' @importFrom data.table data.table rbindlist #' @importFrom reticulate import #' @importFrom data.table := #' @export -get_entities <- function(texts, doc_ids = NULL, tagger = NULL, language = NULL, - show.text_id = FALSE, gc.active = FALSE) { +get_entities <- function(texts, doc_ids = NULL, tagger, show.text_id = FALSE, + gc.active = FALSE, batch_size = 5, device = "cpu", + verbose = FALSE) { + + if (verbose) { + message("Starting entity extraction process...") + message("Checking tagger...") + } - # Check environment pre-requisites + # Check tagger + check_tagger(tagger) + + if (verbose) { + message("Tagger validation successful") + message("Number of texts to process: ", length(texts)) + message("Batch size: ", batch_size) + message("Device: ", device) + } + + # Check environment and parameters check_prerequisites() + check_device(device) + check_batch_size(batch_size) check_show.text_id(show.text_id) # Check and prepare texts and doc_ids @@ -77,45 +177,142 @@ get_entities <- function(texts, doc_ids = NULL, tagger = NULL, language = NULL, texts <- texts_and_ids$texts doc_ids <- texts_and_ids$doc_ids - # Load tagger if null - if (is.null(tagger)) { - tagger <- load_tagger_ner(language) + # Set device for processing + if (device != "cpu") { + tryCatch({ + tagger$to(device) + }, error = function(e) { + warning(sprintf("Error setting device %s: %s\nDefaulting to CPU.", + 
device, e$message)) + device <- "cpu" + }) } Sentence <- reticulate::import("flair")$data$Sentence - # Process each text and extract entities - process_text <- function(text, doc_id) { - text_id <- NULL - if (is.na(text) || is.na(doc_id)) { - return(data.table(doc_id = NA, entity = NA, tag = NA)) + # Helper function for progress bar + create_progress_bar <- function(current, total, width = 50) { + percent <- current / total + filled <- round(width * percent) + empty <- width - filled + bar <- paste0( + "[", + strrep("=", filled), + ">", + strrep(" ", empty), + "] ", + sprintf("%3d%%", round(percent * 100)) + ) + return(bar) + } + + # Process batch of texts and extract entities + process_batch <- function(batch_texts, batch_doc_ids, batch_num, total_batches) { + if (verbose) { + progress_text <- sprintf("\rBatch %d/%d %s Processing %d texts...", + batch_num, total_batches, + create_progress_bar(batch_num, total_batches), + length(batch_texts)) + cat(progress_text) } - sentence <- Sentence(text) - tagger$predict(sentence) - entities <- sentence$get_spans("ner") + results_list <- lapply(seq_along(batch_texts), function(i) { + text <- batch_texts[[i]] + doc_id <- batch_doc_ids[[i]] + + if (is.na(text) || is.na(doc_id)) { + if (verbose) message("Skipping NA text or doc_id") + return(data.table( + doc_id = NA, + entity = NA, + tag = NA, + score = NA + )) + } + + # Error handling for sentence prediction + tryCatch({ + sentence <- Sentence(text) + tagger$predict(sentence) + entities <- sentence$get_spans("ner") - if (length(entities) == 0) { - return(data.table(doc_id = doc_id, entity = NA, tag = NA)) + if (length(entities) == 0) { + return(data.table( + doc_id = doc_id, + entity = NA, + tag = NA, + score = NA + )) + } + + # Create data table with entity information using vapply + dt <- data.table( + doc_id = rep(doc_id, length(entities)), + entity = vapply(entities, function(e) e$text, character(1)), + tag = vapply(entities, function(e) e$tag, character(1)), + score = 
vapply(entities, function(e) e$score, numeric(1)) + ) + + if (isTRUE(show.text_id)) { + dt[, text_id := text] + } + + return(dt) + }, error = function(e) { + warning(sprintf("Error processing text %d: %s", i, e$message)) + return(data.table( + doc_id = doc_id, + entity = NA, + tag = NA, + score = NA + )) + }) + }) + + if (verbose) { + cat("\r", strrep(" ", 80), "\r") # Clear current line + cat(sprintf("Batch %d/%d completed\n", batch_num, total_batches)) } - # Unified data table creation process - dt <- data.table( - doc_id = rep(doc_id, length(entities)), - entity = vapply(entities, function(e) e$text, character(1)), - tag = vapply(entities, function(e) e$tag, character(1)) + return(rbindlist(results_list, fill = TRUE)) + } + + # Process all batches + num_batches <- ceiling(length(texts) / batch_size) + all_results <- lapply(1:num_batches, function(b) { + start_idx <- (b - 1) * batch_size + 1 + end_idx <- min(b * batch_size, length(texts)) + + batch_result <- process_batch( + texts[start_idx:end_idx], + doc_ids[start_idx:end_idx], + b, + num_batches ) - if (isTRUE(show.text_id)) { - dt[, text_id := text] + # Run garbage collection if requested + if (gc.active) { + if (verbose) message("Running garbage collection...") + check_and_gc(gc.active) } - return(dt) + return(batch_result) + }) + + # Combine all results + final_results <- rbindlist(all_results, fill = TRUE) + + # Final summary + if (verbose) { + total_entities <- nrow(final_results[!is.na(entity)]) + message("\n", strrep("=", 60)) + message("Processing Summary:") + message(strrep("-", 60)) + message(sprintf("Total texts processed: %d", length(texts))) + message(sprintf("Total entities found: %d", total_entities)) + message(sprintf("Average entities per text: %.1f", total_entities/length(texts))) + message(strrep("=", 60)) } - # Activate garbage collection - check_and_gc(gc.active) - results_list <- lapply(seq_along(texts), - function(i) {process_text(texts[[i]], doc_ids[[i]])}) - rbindlist(results_list, 
fill = TRUE) + return(final_results) } diff --git a/R/get_entities_batch.R b/R/get_entities_batch.R index 6ef47956..a60d2d6f 100644 --- a/R/get_entities_batch.R +++ b/R/get_entities_batch.R @@ -1,136 +1,137 @@ -#' @title Extract Named Entities from a Batch of Texts -#' -#' @description This function processes batches of texts and extracts named entities. -#' -#' @param texts A character vector of texts to process. -#' @param doc_ids A vector of document IDs corresponding to each text. -#' @param tagger A pre-loaded Flair NER tagger. Default is NULL, and the tagger is loaded based on the provided language. -#' @param language A character string specifying the language of the texts. Default is "en" (English). -#' @param show.text_id Logical, whether to include the text ID in the output. Default is FALSE. -#' @param gc.active Logical, whether to activate garbage collection after processing each batch. Default is FALSE. -#' @param batch_size An integer specifying the size of each batch. Default is 5. -#' @param device A character string specifying the computation device. -#' It can be either "cpu" or a string representation of a GPU device number. -#' For instance, "0" corresponds to the first GPU. If a GPU device number -#' is provided, it will attempt to use that GPU. The default is "cpu". -#' \itemize{ -#' \item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. If -#' there's only one GPU, specifying "cuda" or "cuda:0" will allocate -#' computations to this GPU.} -#' \item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation -#' of specific computations to this GPU.} -#' \item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems -#' with more GPUs.} -#' } -#' -#' @param verbose A logical value. If TRUE, the function prints batch processing -#' progress updates. Default is TRUE. 
-#' -#' @return A data.table containing the extracted entities, their corresponding -#' tags, and document IDs. -#' -#' @importFrom data.table data.table rbindlist -#' @importFrom reticulate import -#' @importFrom data.table := -#' @examples -#' \dontrun{ -#' library(reticulate) -#' library(fliaR) -#' -#' texts <- c("UCD is one of the best universities in Ireland.", -#' "UCD has a good campus but is very far from -#' my apartment in Dublin.", -#' "Essex is famous for social science research.", -#' "Essex is not in the Russell Group, but it is -#' famous for political science research.", -#' "TCD is the oldest university in Ireland.", -#' "TCD is similar to Oxford.") -#' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") -#' # Load NER ("ner") model -#' tagger_ner <- load_tagger_ner('ner') -#' results <- get_entities_batch(texts, doc_ids, tagger_ner) -#' print(results)} -#' @export -get_entities_batch <- function(texts, doc_ids, tagger = NULL, language = "en", - show.text_id = FALSE, gc.active = FALSE, - batch_size = 5, device = "cpu", verbose = TRUE) { - - # Check environment pre-requisites and parameters - check_prerequisites() - check_device(device) - check_batch_size(batch_size) - check_texts_and_ids(texts, doc_ids) - check_show.text_id(show.text_id) - - # Load tagger if null - if (is.null(tagger)) { - tagger <- load_tagger_ner(language) - } - - Sentence <- reticulate::import("flair")$data$Sentence - - # Entity Extraction function to process multiple texts in one call - process_texts_batch <- function(batch_texts, batch_doc_ids) { - text_id <- NULL - if (length(batch_texts) != length(batch_doc_ids)) { - stop("The lengths of batch_texts and batch_doc_ids do not match.") - } - - results_list <- lapply(seq_along(batch_texts), function(i) { - text <- batch_texts[[i]] - doc_id <- batch_doc_ids[[i]] - - if (is.na(text) || is.na(doc_id)) { - return(data.table(doc_id = NA, - entity = NA, - tag = NA, - text_id = ifelse(show.text_id, text, NA)) - ) - } - - 
sentence <- Sentence(text) - tagger$predict(sentence) - entities <- sentence$get_spans("ner") - - if (length(entities) == 0) { - return(data.table(doc_id = doc_id, - entity = NA, - tag = NA, - text_id = ifelse(show.text_id, text, NA))) - } - - # Unified data table creation process - dt <- data.table( - doc_id = rep(doc_id, length(entities)), - entity = vapply(entities, function(e) e$text, character(1)), - tag = vapply(entities, function(e) e$tag, character(1)) - ) - - if (isTRUE(show.text_id)) { - dt[, text_id := text] - } - - return(dt) - }) - - return(rbindlist(results_list, fill = TRUE)) # Return the combined result for this batch - } - - # Batch processing - num_batches <- ceiling(length(texts) / batch_size) - batched_results <- lapply(1:num_batches, function(b) { - # Print the progress if verbose is TRUE - if (isTRUE(verbose)) { - cat(sprintf("Processing batch %d out of %d...\n", b, num_batches)) - } - - start_idx <- (b - 1) * batch_size + 1 - end_idx <- min(b * batch_size, length(texts)) - return(process_texts_batch(texts[start_idx:end_idx], doc_ids[start_idx:end_idx])) - }) - - # Activate garbage collection - check_and_gc(gc.active) - - return(rbindlist(batched_results, fill = TRUE)) -} +#' #' @title Extract Named Entities from a Batch of Texts +#' #' +#' #' @description This function processes batches of texts and extracts named entities. +#' #' +#' #' @param texts A character vector of texts to process. +#' #' @param doc_ids A vector of document IDs corresponding to each text. +#' #' @param tagger A pre-loaded Flair NER tagger. Default is NULL, and the tagger is loaded based on the provided language. +#' #' @param language A character string specifying the language of the texts. Default is "en" (English). +#' #' @param show.text_id Logical, whether to include the text ID in the output. Default is FALSE. +#' #' @param gc.active Logical, whether to activate garbage collection after processing each batch. Default is FALSE. 
+#' #' @param batch_size An integer specifying the size of each batch. Default is 5. +#' #' @param device A character string specifying the computation device. +#' #' It can be either "cpu" or a string representation of a GPU device number. +#' #' For instance, "0" corresponds to the first GPU. If a GPU device number +#' #' is provided, it will attempt to use that GPU. The default is "cpu". +#' #' \itemize{ +#' #' \item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. If +#' #' there's only one GPU, specifying "cuda" or "cuda:0" will allocate +#' #' computations to this GPU.} +#' #' \item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation +#' #' of specific computations to this GPU.} +#' #' \item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems +#' #' with more GPUs.} +#' #' } +#' #' +#' #' @param verbose A logical value. If TRUE, the function prints batch processing +#' #' progress updates. Default is TRUE. +#' #' +#' #' @return A data.table containing the extracted entities, their corresponding +#' #' tags, and document IDs. 
+#' #' +#' #' @importFrom data.table data.table rbindlist +#' #' @importFrom reticulate import +#' #' @importFrom data.table := +#' #' @examples +#' #' \dontrun{ +#' #' library(reticulate) +#' #' library(fliaR) +#' #' +#' #' texts <- c("UCD is one of the best universities in Ireland.", +#' #' "UCD has a good campus but is very far from +#' #' my apartment in Dublin.", +#' #' "Essex is famous for social science research.", +#' #' "Essex is not in the Russell Group, but it is +#' #' famous for political science research.", +#' #' "TCD is the oldest university in Ireland.", +#' #' "TCD is similar to Oxford.") +#' #' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") +#' #' # Load NER ("ner") model +#' #' tagger_ner <- load_tagger_ner('ner') +#' #' results <- get_entities_batch(texts, doc_ids, tagger_ner) +#' #' print(results)} +#' #' @export + +# get_entities_batch <- function(texts, doc_ids, tagger = NULL, language = "en", +# show.text_id = FALSE, gc.active = FALSE, +# batch_size = 5, device = "cpu", verbose = TRUE) { +# +# # Check environment pre-requisites and parameters +# check_prerequisites() +# check_device(device) +# check_batch_size(batch_size) +# check_texts_and_ids(texts, doc_ids) +# check_show.text_id(show.text_id) +# +# # Load tagger if null +# if (is.null(tagger)) { +# tagger <- load_tagger_ner(language) +# } +# +# Sentence <- reticulate::import("flair")$data$Sentence +# +# # Entity Extraction function to process multiple texts in one call +# process_texts_batch <- function(batch_texts, batch_doc_ids) { +# text_id <- NULL +# if (length(batch_texts) != length(batch_doc_ids)) { +# stop("The lengths of batch_texts and batch_doc_ids do not match.") +# } +# +# results_list <- lapply(seq_along(batch_texts), function(i) { +# text <- batch_texts[[i]] +# doc_id <- batch_doc_ids[[i]] +# +# if (is.na(text) || is.na(doc_id)) { +# return(data.table(doc_id = NA, +# entity = NA, +# tag = NA, +# text_id = ifelse(show.text_id, text, NA)) +# ) +# } +# +# 
sentence <- Sentence(text) +# tagger$predict(sentence) +# entities <- sentence$get_spans("ner") +# +# if (length(entities) == 0) { +# return(data.table(doc_id = doc_id, +# entity = NA, +# tag = NA, +# text_id = ifelse(show.text_id, text, NA))) +# } +# +# # Unified data table creation process +# dt <- data.table( +# doc_id = rep(doc_id, length(entities)), +# entity = vapply(entities, function(e) e$text, character(1)), +# tag = vapply(entities, function(e) e$tag, character(1)) +# ) +# +# if (isTRUE(show.text_id)) { +# dt[, text_id := text] +# } +# +# return(dt) +# }) +# +# return(rbindlist(results_list, fill = TRUE)) # Return the combined result for this batch +# } +# +# # Batch processing +# num_batches <- ceiling(length(texts) / batch_size) +# batched_results <- lapply(1:num_batches, function(b) { +# # Print the progress if verbose is TRUE +# if (isTRUE(verbose)) { +# cat(sprintf("Processing batch %d out of %d...\n", b, num_batches)) +# } +# +# start_idx <- (b - 1) * batch_size + 1 +# end_idx <- min(b * batch_size, length(texts)) +# return(process_texts_batch(texts[start_idx:end_idx], doc_ids[start_idx:end_idx])) +# }) +# +# # Activate garbage collection +# check_and_gc(gc.active) +# +# return(rbindlist(batched_results, fill = TRUE)) +# } diff --git a/R/get_pos.R b/R/get_pos.R index fff7ff01..b763a033 100644 --- a/R/get_pos.R +++ b/R/get_pos.R @@ -85,7 +85,7 @@ get_pos <- function(texts, doc_ids = NULL, tagger = NULL, language = NULL, text_id = ifelse(show.text_id, text, NA), token = NA, tag = NA, - precision = NA)) + score = NA)) } sentence <- Sentence(texts[[i]]) tagger$predict(sentence) @@ -99,7 +99,7 @@ get_pos <- function(texts, doc_ids = NULL, tagger = NULL, language = NULL, text_id = ifelse(show.text_id, text, NA), token = NA, tag = NA, - precision = NA)) + score = NA)) } else { return(data.table( doc_id = rep(doc_ids[[i]], length(tag_list)), @@ -107,7 +107,7 @@ get_pos <- function(texts, doc_ids = NULL, tagger = NULL, language = NULL, text_id = 
ifelse(show.text_id, text, NA), token = vapply(tag_list, function(x) gsub('^Token\\[\\d+\\]: "(.*)" .*', '\\1', x), character(1)), tag = vapply(tag_list, function(x) gsub('^Token\\[\\d+\\]: ".*" \u2192 (.*) \\(.*\\)', '\\1', x), character(1)), - precision = as.numeric(vapply(tag_list, function(x) gsub(".*\\((.*)\\)", "\\1", x), character(1))) + score = as.numeric(vapply(tag_list, function(x) gsub(".*\\((.*)\\)", "\\1", x), character(1))) )) } } diff --git a/R/get_pos_batch.R b/R/get_pos_batch.R index e093468a..03df53aa 100644 --- a/R/get_pos_batch.R +++ b/R/get_pos_batch.R @@ -1,137 +1,137 @@ -#' @title Batch Process of Part-of-Speech Tagging +#' #' @title Batch Process of Part-of-Speech Tagging +#' #' +#' #' @description This function returns a data table of POS tags and other related +#' #' data for the given texts using batch processing. +#' #' +#' #' @param texts A character vector containing texts to be processed. +#' #' @param doc_ids A character vector containing document ids. +#' #' @param tagger A tagger object (default is NULL). +#' #' @param language The language of the texts (default is NULL). +#' #' @param show.text_id A logical value. If TRUE, includes the actual text from +#' #' which the entity was extracted in the resulting data table. Useful for +#' #' verification and traceability purposes but might increase the size of +#' #' the output. Default is FALSE. +#' #' @param gc.active A logical value. If TRUE, runs the garbage collector after +#' #' processing all texts. This can help in freeing up memory by releasing unused +#' #' memory space, especially when processing a large number of texts. +#' #' Default is FALSE. +#' #' @param batch_size An integer specifying the size of each batch. Default is 5. +#' #' @param device A character string specifying the computation device. +#' #' \itemize{ +#' #' \item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. 
If +#' #' there's only one GPU, specifying "cuda" or "cuda:0" will allocate +#' #' computations to this GPU.} +#' #' \item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation +#' #' of specific computations to this GPU.} +#' #' \item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems +#' #' with more GPUs.} +#' #' } +#' #' +#' #' @param verbose A logical value. If TRUE, the function prints batch processing +#' #' progress updates. Default is TRUE. +#' #' +#' #' @return A data.table containing the following columns: +#' #' \describe{ +#' #' \item{`doc_id`}{The document identifier corresponding to each text.} +#' #' \item{`token_id`}{The token number in the original text, +#' #' indicating the position of the token.} +#' #' \item{`text_id`}{The actual text input passed to the function (if show.text_id is TRUE).} +#' #' \item{`token`}{The individual word or token from the text that was +#' #' POS tagged.} +#' #' \item{`tag`}{The part-of-speech tag assigned to the token by +#' #' the Flair library.} +#' #' \item{`precision`}{A confidence score (numeric) for the +#' #' assigned POS tag.} +#' #' } +#' #' +#' #' @import reticulate +#' #' @export +#' #' +#' #' @examples +#' #' \dontrun{ +#' #' library(reticulate) +#' #' library(fliaR) +#' #' tagger_pos_fast <- load_tagger_pos('pos-fast') +#' #' texts <- c("UCD is one of the best universities in Ireland.", +#' #' "Essex is not in the Russell Group, but it is famous for political science research.", +#' #' "TCD is the oldest university in Ireland.") +#' #' doc_ids <- c("doc1", "doc2", "doc3") +#' #' +#' #' # Using the batch_size parameter +#' #' get_pos_batch(texts, doc_ids, tagger_pos_fast, batch_size = 2) +#' #' } #' -#' @description This function returns a data table of POS tags and other related -#' data for the given texts using batch processing. 
+#' get_pos_batch <- function(texts, doc_ids, tagger = NULL, language = NULL, +#' show.text_id = FALSE, gc.active = FALSE, +#' batch_size = 5, device = "cpu", verbose = TRUE) { #' -#' @param texts A character vector containing texts to be processed. -#' @param doc_ids A character vector containing document ids. -#' @param tagger A tagger object (default is NULL). -#' @param language The language of the texts (default is NULL). -#' @param show.text_id A logical value. If TRUE, includes the actual text from -#' which the entity was extracted in the resulting data table. Useful for -#' verification and traceability purposes but might increase the size of -#' the output. Default is FALSE. -#' @param gc.active A logical value. If TRUE, runs the garbage collector after -#' processing all texts. This can help in freeing up memory by releasing unused -#' memory space, especially when processing a large number of texts. -#' Default is FALSE. -#' @param batch_size An integer specifying the size of each batch. Default is 5. -#' @param device A character string specifying the computation device. -#' \itemize{ -#' \item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. If -#' there's only one GPU, specifying "cuda" or "cuda:0" will allocate -#' computations to this GPU.} -#' \item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation -#' of specific computations to this GPU.} -#' \item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems -#' with more GPUs.} -#' } +#' # Check environment pre-requisites and parameters +#' check_prerequisites() +#' check_texts_and_ids(texts, doc_ids) +#' check_show.text_id(show.text_id) +#' check_device(device) +#' check_batch_size(batch_size) #' -#' @param verbose A logical value. If TRUE, the function prints batch processing -#' progress updates. Default is TRUE. 
+#' # Import the `Sentence` tokenizer and `Classifier` from Python's Flair +#' flair <- reticulate::import("flair") +#' Sentence <- flair$data$Sentence #' -#' @return A data.table containing the following columns: -#' \describe{ -#' \item{`doc_id`}{The document identifier corresponding to each text.} -#' \item{`token_id`}{The token number in the original text, -#' indicating the position of the token.} -#' \item{`text_id`}{The actual text input passed to the function (if show.text_id is TRUE).} -#' \item{`token`}{The individual word or token from the text that was -#' POS tagged.} -#' \item{`tag`}{The part-of-speech tag assigned to the token by -#' the Flair library.} -#' \item{`precision`}{A confidence score (numeric) for the -#' assigned POS tag.} -#' } +#' # Load tagger if null +#' if (is.null(tagger)) { +#' tagger <- load_tagger_pos(language) +#' } +#' +#' # Function to process a single sentence +#' process_single_sentence <- function(sentence, doc_id) { +#' text <- sentence$text +#' tag_list <- sentence$labels +#' +#' # Check if there are no pos tag in tag_list if tag_list empty returns NAs +#' if (length(tag_list) == 0) { +#' return(data.table(doc_id = doc_id, +#' token_id = NA, +#' text_id = ifelse(show.text_id, text, NA), +#' token = NA, +#' tag = NA, +#' precision = NA)) +#' } else { +#' return(data.table( +#' doc_id = rep(doc_id, length(tag_list)), +#' token_id = as.numeric(vapply(tag_list, function(x) gsub("^Token\\[([0-9]+)\\].*$", "\\1", x), character(1))), +#' text_id = ifelse(show.text_id, text, NA), +#' token = vapply(tag_list, function(x) gsub('^Token\\[\\d+\\]: "(.*)" .*', '\\1', x), character(1)), +#' tag = vapply(tag_list, function(x) gsub('^Token\\[\\d+\\]: ".*" \u2192 (.*) \\(.*\\)', '\\1', x), character(1)), +#' precision = as.numeric(vapply(tag_list, function(x) gsub(".*\\((.*)\\)", "\\1", x), character(1))) +#' )) +#' } +#' } +#' +#' process_batch <- function(start_idx) { +#' if (verbose) { +#' cat(paste("Processing batch starting at 
index:", start_idx, "\n")) +#' } +#' +#' batch_texts <- texts[start_idx:(start_idx+batch_size-1)] +#' batch_ids <- doc_ids[start_idx:(start_idx+batch_size-1)] +#' batch_sentences <- lapply(batch_texts, Sentence) +#' lapply(batch_sentences, tagger$predict) +#' +#' dt_list <- lapply(seq_along(batch_sentences), function(i) { +#' process_single_sentence(batch_sentences[[i]], batch_ids[[i]]) +#' }) +#' +#' return(rbindlist(dt_list, fill = TRUE)) # Bind the results within this batch together +#' } +#' +#' # Split the data into batches and process each batch +#' n <- length(texts) +#' idxs <- seq(1, n, by = batch_size) +#' results_list <- lapply(idxs, process_batch) #' -#' @import reticulate -#' @export +#' # Combine the results from all batches +#' results_dt <- rbindlist(results_list, fill = TRUE) #' -#' @examples -#' \dontrun{ -#' library(reticulate) -#' library(fliaR) -#' tagger_pos_fast <- load_tagger_pos('pos-fast') -#' texts <- c("UCD is one of the best universities in Ireland.", -#' "Essex is not in the Russell Group, but it is famous for political science research.", -#' "TCD is the oldest university in Ireland.") -#' doc_ids <- c("doc1", "doc2", "doc3") +#' check_and_gc(gc.active) #' -#' # Using the batch_size parameter -#' get_pos_batch(texts, doc_ids, tagger_pos_fast, batch_size = 2) +#' return(results_dt) #' } - -get_pos_batch <- function(texts, doc_ids, tagger = NULL, language = NULL, - show.text_id = FALSE, gc.active = FALSE, - batch_size = 5, device = "cpu", verbose = TRUE) { - - # Check environment pre-requisites and parameters - check_prerequisites() - check_texts_and_ids(texts, doc_ids) - check_show.text_id(show.text_id) - check_device(device) - check_batch_size(batch_size) - - # Import the `Sentence` tokenizer and `Classifier` from Python's Flair - flair <- reticulate::import("flair") - Sentence <- flair$data$Sentence - - # Load tagger if null - if (is.null(tagger)) { - tagger <- load_tagger_pos(language) - } - - # Function to process a single sentence 
- process_single_sentence <- function(sentence, doc_id) { - text <- sentence$text - tag_list <- sentence$labels - - # Check if there are no pos tag in tag_list if tag_list empty returns NAs - if (length(tag_list) == 0) { - return(data.table(doc_id = doc_id, - token_id = NA, - text_id = ifelse(show.text_id, text, NA), - token = NA, - tag = NA, - precision = NA)) - } else { - return(data.table( - doc_id = rep(doc_id, length(tag_list)), - token_id = as.numeric(vapply(tag_list, function(x) gsub("^Token\\[([0-9]+)\\].*$", "\\1", x), character(1))), - text_id = ifelse(show.text_id, text, NA), - token = vapply(tag_list, function(x) gsub('^Token\\[\\d+\\]: "(.*)" .*', '\\1', x), character(1)), - tag = vapply(tag_list, function(x) gsub('^Token\\[\\d+\\]: ".*" \u2192 (.*) \\(.*\\)', '\\1', x), character(1)), - precision = as.numeric(vapply(tag_list, function(x) gsub(".*\\((.*)\\)", "\\1", x), character(1))) - )) - } - } - - process_batch <- function(start_idx) { - if (verbose) { - cat(paste("Processing batch starting at index:", start_idx, "\n")) - } - - batch_texts <- texts[start_idx:(start_idx+batch_size-1)] - batch_ids <- doc_ids[start_idx:(start_idx+batch_size-1)] - batch_sentences <- lapply(batch_texts, Sentence) - lapply(batch_sentences, tagger$predict) - - dt_list <- lapply(seq_along(batch_sentences), function(i) { - process_single_sentence(batch_sentences[[i]], batch_ids[[i]]) - }) - - return(rbindlist(dt_list, fill = TRUE)) # Bind the results within this batch together - } - - # Split the data into batches and process each batch - n <- length(texts) - idxs <- seq(1, n, by = batch_size) - results_list <- lapply(idxs, process_batch) - - # Combine the results from all batches - results_dt <- rbindlist(results_list, fill = TRUE) - - check_and_gc(gc.active) - - return(results_dt) -} diff --git a/R/get_sentiments.R b/R/get_sentiments.R index 4c156839..85184a94 100644 --- a/R/get_sentiments.R +++ b/R/get_sentiments.R @@ -1,115 +1,115 @@ -#' @title Tagging Sentiment with 
Flair Standard Models +#' #' @title Tagging Sentiment with Flair Standard Models +#' #' +#' #' @description This function takes in texts and their associated document IDs +#' #' to predict sentiments using the flair Python library. +#' #' +#' #' @param texts A list or vector of texts for which sentiment prediction is +#' #' to be made. +#' #' @param doc_ids A list or vector of document IDs corresponding to the texts. +#' #' @param language A character string indicating the language of the texts. +#' #' Currently supports "sentiment" (English), "sentiment-fast" (English), and +#' #' "de-offensive-language" (German) +#' #' @param tagger An optional flair sentiment model. If NULL (default), +#' #' the function loads the default model based on the language. +#' #' @param ... Additional arguments passed to next. +#' #' @param show.text_id A logical value. If TRUE, includes the actual text from +#' #' which the sentiment was predicted. Default is FALSE. +#' #' @param gc.active A logical value. If TRUE, runs the garbage collector after +#' #' processing all texts. This can help in freeing up memory by releasing unused +#' #' memory space, especially when processing a large number of texts. +#' #' Default is FALSE. +#' #' +#' #' @return A `data.table` containing three columns: +#' #' \itemize{ +#' #' \item `doc_id`: The document ID from the input. +#' #' \item `sentiment`: Predicted sentiment for the text. +#' #' \item `score`: Score for the sentiment prediction. 
+#' #' } +#' #' +#' #' @examples +#' #' \dontrun{ +#' #' library(flaiR) +#' #' texts <- c("UCD is one of the best universities in Ireland.", +#' #' "UCD has a good campus but is very far from my apartment in Dublin.", +#' #' "Essex is famous for social science research.", +#' #' "Essex is not in the Russell Group, but it is famous for political science research.", +#' #' "TCD is the oldest university in Ireland.", +#' #' "TCD is similar to Oxford.") +#' #' +#' #' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") +#' #' +#' #' # Load re-trained sentiment ("sentiment") model +#' #' tagger_sent <- load_tagger_sentiments('sentiment') +#' #' +#' #' results <- get_sentiments(texts, doc_ids, tagger_sent) +#' #' print(results) +#' #' } +#' #' +#' #' @importFrom data.table data.table +#' #' @importFrom reticulate py_available py_module_available import +#' #' @importFrom data.table := +#' #' @export +#' get_sentiments <- function(texts, doc_ids, +#' tagger = NULL, +#' ..., +#' language = NULL, +#' show.text_id = FALSE, gc.active = FALSE ) { +#' # Check environment pre-requisites +#' check_prerequisites() +#' check_show.text_id(show.text_id) #' -#' @description This function takes in texts and their associated document IDs -#' to predict sentiments using the flair Python library. +#' # Check and prepare texts and doc_ids +#' texts_and_ids <- check_texts_and_ids(texts, doc_ids) +#' texts <- texts_and_ids$texts +#' doc_ids <- texts_and_ids$doc_ids #' -#' @param texts A list or vector of texts for which sentiment prediction is -#' to be made. -#' @param doc_ids A list or vector of document IDs corresponding to the texts. -#' @param language A character string indicating the language of the texts. -#' Currently supports "sentiment" (English), "sentiment-fast" (English), and -#' "de-offensive-language" (German) -#' @param tagger An optional flair sentiment model. If NULL (default), -#' the function loads the default model based on the language. -#' @param ... 
Additional arguments passed to next. -#' @param show.text_id A logical value. If TRUE, includes the actual text from -#' which the sentiment was predicted. Default is FALSE. -#' @param gc.active A logical value. If TRUE, runs the garbage collector after -#' processing all texts. This can help in freeing up memory by releasing unused -#' memory space, especially when processing a large number of texts. -#' Default is FALSE. +#' # Load the Sentence tokenizer from the Flair library in Python. +#' flair <- reticulate::import("flair") +#' Classifier <- flair$nn$Classifier +#' Sentence <- flair$data$Sentence #' -#' @return A `data.table` containing three columns: -#' \itemize{ -#' \item `doc_id`: The document ID from the input. -#' \item `sentiment`: Predicted sentiment for the text. -#' \item `score`: Score for the sentiment prediction. -#' } +#' # Load tagger if null +#' if (is.null(tagger)) { +#' tagger <- load_tagger_sentiments(language) +#' } +#' +#' # Function to process each text +#' process_text <- function(text, doc_id) { +#' text_id <- NULL +#' if (is.na(text)) { +#' return(data.table(doc_id = doc_id, text_id = NA, sentiment = NA, score = NA)) +#' } +#' +#' # Create a sentence using provided text +#' sentence <- Sentence(text) #' -#' @examples -#' \dontrun{ -#' library(flaiR) -#' texts <- c("UCD is one of the best universities in Ireland.", -#' "UCD has a good campus but is very far from my apartment in Dublin.", -#' "Essex is famous for social science research.", -#' "Essex is not in the Russell Group, but it is famous for political science research.", -#' "TCD is the oldest university in Ireland.", -#' "TCD is similar to Oxford.") +#' # Predict sentiment +#' tagger$predict(sentence) #' -#' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") +#' # Check if there's a predicted label +#' if (length(sentence$get_labels()) > 0) { +#' sentiment_label <- sentence$get_labels()[[1]]$value +#' sentiment_score <- sentence$get_labels()[[1]]$score +#' } else { 
+#' sentiment_label <- NA +#' sentiment_score <- NA +#' } #' -#' # Load re-trained sentiment ("sentiment") model -#' tagger_sent <- load_tagger_sentiments('sentiment') +#' dt <- data.table(doc_id = doc_id, +#' sentiment = sentiment_label, +#' score = sentiment_score) #' -#' results <- get_sentiments(texts, doc_ids, tagger_sent) -#' print(results) -#' } +#' if (isTRUE(show.text_id)) { +#' dt[, text_id := text] +#' } #' -#' @importFrom data.table data.table -#' @importFrom reticulate py_available py_module_available import -#' @importFrom data.table := -#' @export -get_sentiments <- function(texts, doc_ids, - tagger = NULL, - ..., - language = NULL, - show.text_id = FALSE, gc.active = FALSE ) { - # Check environment pre-requisites - check_prerequisites() - check_show.text_id(show.text_id) - - # Check and prepare texts and doc_ids - texts_and_ids <- check_texts_and_ids(texts, doc_ids) - texts <- texts_and_ids$texts - doc_ids <- texts_and_ids$doc_ids - - # Load the Sentence tokenizer from the Flair library in Python. 
- flair <- reticulate::import("flair") - Classifier <- flair$nn$Classifier - Sentence <- flair$data$Sentence - - # Load tagger if null - if (is.null(tagger)) { - tagger <- load_tagger_sentiments(language) - } - - # Function to process each text - process_text <- function(text, doc_id) { - text_id <- NULL - if (is.na(text)) { - return(data.table(doc_id = doc_id, text_id = NA, sentiment = NA, score = NA)) - } - - # Create a sentence using provided text - sentence <- Sentence(text) - - # Predict sentiment - tagger$predict(sentence) - - # Check if there's a predicted label - if (length(sentence$get_labels()) > 0) { - sentiment_label <- sentence$get_labels()[[1]]$value - sentiment_score <- sentence$get_labels()[[1]]$score - } else { - sentiment_label <- NA - sentiment_score <- NA - } - - dt <- data.table(doc_id = doc_id, - sentiment = sentiment_label, - score = sentiment_score) - - if (isTRUE(show.text_id)) { - dt[, text_id := text] - } - - return(dt) - } - - results_list <- lapply(seq_along(texts), function(i) process_text(texts[i], doc_ids[i])) - results_dt <- rbindlist(results_list, fill=TRUE) - - # Activate garbage collection - check_and_gc(gc.active) - return(results_dt) - } +#' return(dt) +#' } +#' +#' results_list <- lapply(seq_along(texts), function(i) process_text(texts[i], doc_ids[i])) +#' results_dt <- rbindlist(results_list, fill=TRUE) +#' +#' # Activate garbage collection +#' check_and_gc(gc.active) +#' return(results_dt) +#' } diff --git a/R/get_sentiments_batch.R b/R/get_sentiments_batch.R index 6d9dca08..cc2cd619 100644 --- a/R/get_sentiments_batch.R +++ b/R/get_sentiments_batch.R @@ -1,146 +1,146 @@ -#' @title Batch Process of Tagging Sentiment with Flair Models +#' #' @title Batch Process of Tagging Sentiment with Flair Models +#' #' +#' #' @description This function takes in texts and their associated document IDs +#' #' to predict sentiments using the flair Python library. 
+#' #' +#' #' @param texts A list or vector of texts for which sentiment prediction is +#' #' to be made. +#' #' @param doc_ids A list or vector of document IDs corresponding to the texts. +#' #' @param language A character string indicating the language of the texts. +#' #' Currently supports "sentiment" (English), "sentiment-fast" (English), and +#' #' "de-offensive-language" (German) +#' #' @param tagger An optional flair sentiment model. If NULL (default), +#' #' the function loads the default model based on the language. +#' #' @param ... Additional arguments passed to next. +#' #' @param show.text_id A logical value. If TRUE, includes the actual text from +#' #' which the sentiment was predicted. Default is FALSE. +#' #' @param gc.active A logical value. If TRUE, runs the garbage collector after +#' #' processing all texts. This can help in freeing up memory by releasing unused +#' #' memory space, especially when processing a large number of texts. +#' #' Default is FALSE. +#' #' @param batch_size An integer specifying the number of texts to be processed +#' #' at once. It can help optimize performance by leveraging parallel processing. +#' #' Default is 5. +#' #' @param device A character string specifying the computation device. +#' #' It can be either "cpu" or a string representation of a GPU device number. +#' #' For instance, "0" corresponds to the first GPU. If a GPU device number +#' #' is provided, it will attempt to use that GPU. The default is "cpu". +#' #' \itemize{ +#' #' \item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. 
If +#' #' there's only one GPU, specifying "cuda" or "cuda:0" will allocate +#' #' computations to this GPU.} +#' #' \item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation +#' #' of specific computations to this GPU.} +#' #' \item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems +#' #' with more GPUs.} +#' #' } +#' #' +#' #' @param verbose A logical value. If TRUE, the function prints batch processing +#' #' progress updates. Default is TRUE. +#' #' +#' #' @return A `data.table` containing three columns: +#' #' \itemize{ +#' #' \item `doc_id`: The document ID from the input. +#' #' \item `sentiment`: Predicted sentiment for the text. +#' #' \item `score`: Score for the sentiment prediction. +#' #' } +#' #' +#' #' @examples +#' #' \dontrun{ +#' #' library(flaiR) +#' #' +#' #' +#' #' texts <- c("UCD is one of the best universities in Ireland.", +#' #' "UCD has a good campus but is very far from my apartment in Dublin.", +#' #' "Essex is famous for social science research.", +#' #' "Essex is not in the Russell Group, but it is famous for political science research.", +#' #' "TCD is the oldest university in Ireland.", +#' #' "TCD is similar to Oxford.") +#' #' +#' #' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") +#' #' +#' #' # Load re-trained sentiment ("sentiment") model +#' #' tagger_sent <- load_tagger_sentiments('sentiment') +#' #' +#' #' results <- get_sentiments_batch(texts, doc_ids, tagger_sent, batch_size = 3) +#' #' print(results) +#' #' } +#' #' +#' #' @importFrom data.table data.table +#' #' @importFrom reticulate py_available py_module_available import +#' #' @importFrom data.table := +#' #' @export +#' get_sentiments_batch <- function(texts, doc_ids, +#' tagger = NULL, ..., language = NULL, +#' show.text_id = FALSE, gc.active = FALSE, +#' batch_size = 5, device = "cpu", verbose = FALSE) { +#' # Check environment pre-requisites and parameters +#' check_prerequisites() +#' 
check_device(device) +#' check_batch_size(batch_size) +#' check_texts_and_ids(texts, doc_ids) +#' check_show.text_id(show.text_id) #' -#' @description This function takes in texts and their associated document IDs -#' to predict sentiments using the flair Python library. +#' # Load the Sentence tokenizer from the Flair library in Python. +#' flair <- reticulate::import("flair") +#' Classifier <- flair$nn$Classifier +#' Sentence <- flair$data$Sentence #' -#' @param texts A list or vector of texts for which sentiment prediction is -#' to be made. -#' @param doc_ids A list or vector of document IDs corresponding to the texts. -#' @param language A character string indicating the language of the texts. -#' Currently supports "sentiment" (English), "sentiment-fast" (English), and -#' "de-offensive-language" (German) -#' @param tagger An optional flair sentiment model. If NULL (default), -#' the function loads the default model based on the language. -#' @param ... Additional arguments passed to next. -#' @param show.text_id A logical value. If TRUE, includes the actual text from -#' which the sentiment was predicted. Default is FALSE. -#' @param gc.active A logical value. If TRUE, runs the garbage collector after -#' processing all texts. This can help in freeing up memory by releasing unused -#' memory space, especially when processing a large number of texts. -#' Default is FALSE. -#' @param batch_size An integer specifying the number of texts to be processed -#' at once. It can help optimize performance by leveraging parallel processing. -#' Default is 5. -#' @param device A character string specifying the computation device. -#' It can be either "cpu" or a string representation of a GPU device number. -#' For instance, "0" corresponds to the first GPU. If a GPU device number -#' is provided, it will attempt to use that GPU. The default is "cpu". -#' \itemize{ -#' \item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. 
If -#' there's only one GPU, specifying "cuda" or "cuda:0" will allocate -#' computations to this GPU.} -#' \item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation -#' of specific computations to this GPU.} -#' \item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems -#' with more GPUs.} -#' } +#' # Load tagger if null +#' if (is.null(tagger)) { +#' tagger <- load_tagger_sentiments(language) +#' } +#' +#' # `process_batch` to batch process +#' process_batch <- function(texts_batch, doc_ids_batch) { +#' text_id <- NULL +#' sentences <- lapply(texts_batch, flair$data$Sentence) +#' +#' # Predict sentiments for the entire batch +#' tagger$predict(sentences) +#' +#' results <- lapply(seq_along(sentences), function(i) { +#' sentence <- sentences[[i]] +#' doc_id <- doc_ids_batch[i] +#' +#' if (length(sentence$get_labels()) > 0) { +#' sentiment_label <- sentence$get_labels()[[1]]$value +#' sentiment_score <- sentence$get_labels()[[1]]$score +#' } else { +#' sentiment_label <- NA +#' sentiment_score <- NA +#' } #' -#' @param verbose A logical value. If TRUE, the function prints batch processing -#' progress updates. Default is TRUE. +#' dt <- data.table(doc_id = doc_id, +#' sentiment = sentiment_label, +#' score = sentiment_score) +#' if (isTRUE(show.text_id)) { +#' dt[, text_id := texts_batch[i]] +#' } +#' dt +#' }) #' -#' @return A `data.table` containing three columns: -#' \itemize{ -#' \item `doc_id`: The document ID from the input. -#' \item `sentiment`: Predicted sentiment for the text. -#' \item `score`: Score for the sentiment prediction. 
+#' return(rbindlist(results, fill = TRUE)) #' } #' -#' @examples -#' \dontrun{ -#' library(flaiR) +#' # Split texts into batches and process +#' num_batches <- ceiling(length(texts) / batch_size) +#' results_list <- lapply(1:num_batches, function(i) { +#' start_idx <- (i-1)*batch_size + 1 +#' end_idx <- min(i*batch_size, length(texts)) #' +#' if (isTRUE(verbose)) { +#' cat(sprintf("Processing batch %d out of %d...\n", i, num_batches)) +#' } #' -#' texts <- c("UCD is one of the best universities in Ireland.", -#' "UCD has a good campus but is very far from my apartment in Dublin.", -#' "Essex is famous for social science research.", -#' "Essex is not in the Russell Group, but it is famous for political science research.", -#' "TCD is the oldest university in Ireland.", -#' "TCD is similar to Oxford.") +#' process_batch(texts[start_idx:end_idx], doc_ids[start_idx:end_idx]) +#' }) #' -#' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") +#' results_dt <- rbindlist(results_list, fill = TRUE) #' -#' # Load re-trained sentiment ("sentiment") model -#' tagger_sent <- load_tagger_sentiments('sentiment') +#' # Activate garbage collection +#' check_and_gc(gc.active) #' -#' results <- get_sentiments_batch(texts, doc_ids, tagger_sent, batch_size = 3) -#' print(results) +#' return(results_dt) #' } -#' -#' @importFrom data.table data.table -#' @importFrom reticulate py_available py_module_available import -#' @importFrom data.table := -#' @export -get_sentiments_batch <- function(texts, doc_ids, - tagger = NULL, ..., language = NULL, - show.text_id = FALSE, gc.active = FALSE, - batch_size = 5, device = "cpu", verbose = FALSE) { - # Check environment pre-requisites and parameters - check_prerequisites() - check_device(device) - check_batch_size(batch_size) - check_texts_and_ids(texts, doc_ids) - check_show.text_id(show.text_id) - - # Load the Sentence tokenizer from the Flair library in Python. 
- flair <- reticulate::import("flair") - Classifier <- flair$nn$Classifier - Sentence <- flair$data$Sentence - - # Load tagger if null - if (is.null(tagger)) { - tagger <- load_tagger_sentiments(language) - } - - # `process_batch` to batch process - process_batch <- function(texts_batch, doc_ids_batch) { - text_id <- NULL - sentences <- lapply(texts_batch, flair$data$Sentence) - - # Predict sentiments for the entire batch - tagger$predict(sentences) - - results <- lapply(seq_along(sentences), function(i) { - sentence <- sentences[[i]] - doc_id <- doc_ids_batch[i] - - if (length(sentence$get_labels()) > 0) { - sentiment_label <- sentence$get_labels()[[1]]$value - sentiment_score <- sentence$get_labels()[[1]]$score - } else { - sentiment_label <- NA - sentiment_score <- NA - } - - dt <- data.table(doc_id = doc_id, - sentiment = sentiment_label, - score = sentiment_score) - if (isTRUE(show.text_id)) { - dt[, text_id := texts_batch[i]] - } - dt - }) - - return(rbindlist(results, fill = TRUE)) - } - - # Split texts into batches and process - num_batches <- ceiling(length(texts) / batch_size) - results_list <- lapply(1:num_batches, function(i) { - start_idx <- (i-1)*batch_size + 1 - end_idx <- min(i*batch_size, length(texts)) - - if (isTRUE(verbose)) { - cat(sprintf("Processing batch %d out of %d...\n", i, num_batches)) - } - - process_batch(texts[start_idx:end_idx], doc_ids[start_idx:end_idx]) - }) - - results_dt <- rbindlist(results_list, fill = TRUE) - - # Activate garbage collection - check_and_gc(gc.active) - - return(results_dt) -} diff --git a/R/note.R b/R/note.R index af3bb6e7..9c18c0d1 100644 --- a/R/note.R +++ b/R/note.R @@ -150,3 +150,357 @@ # packageStartupMessage("Flair NLP can be successfully imported in R via {flaiR} ! \U1F44F") # } # } + + + +#' #' @title Load the Named Entity Recognition (NER) Tagger +#' #' +#' #' @description A helper function to load the appropriate tagger based on the provided language. 
+#' #' This function supports a variety of languages/models. +#' #' +#' #' @param language A character string indicating the desired language for the NER tagger. +#' #' If `NULL`, the function will default to the 'pos-fast' model. +#' #' Supported languages and their models include: +#' #' \itemize{ +#' #' \item `"en"` - English NER tagging (`ner`) +#' #' \item `"de"` - German NER tagging (`de-ner`) +#' #' \item `"fr"` - French NER tagging (`fr-ner`) +#' #' \item `"nl"` - Dutch NER tagging (`nl-ner`) +#' #' \item `"da"` - Danish NER tagging (`da-ner`) +#' #' \item `"ar"` - Arabic NER tagging (`ar-ner`) +#' #' \item `"ner-fast"` - English NER fast model (`ner-fast`) +#' #' \item `"ner-large"` - English NER large mode (`ner-large`) +#' #' \item `"de-ner-legal"` - NER (legal text) (`de-ner-legal`) +#' #' \item `"nl"` - Dutch NER tagging (`nl-ner`) +#' #' \item `"da"` - Danish NER tagging (`da-ner`) +#' #' \item `"ar"` - Arabic NER tagging (`ar-ner`) +#' #'} +#' #' +#' #' @return An instance of the Flair SequenceTagger for the specified language. 
+#' #' +#' #' @import reticulate +#' #' @importFrom stats setNames +#' #' +#' #' @examples +#' #' # Load the English NER tagger +#' #' tagger_en <- load_tagger_ner("en") +#' #' +#' #' @export +#' load_tagger_ner <- function(language = NULL) { +#' supported_lan_models <- c("ner", "de-ner", +#' "fr-ner", "nl-ner", +#' "da-ner", "ar-ner", +#' "ner-fast", "ner-large", +#' "ner-pooled", "ner-ontonotes", +#' "ner-ontonotes-fast", "ner-ontonotes-large", +#' "de-ner-large", "de-ner-germeval", +#' "de-ner-legal", "es-ner", +#' "nl-ner", "nl-ner-large", +#' "nl-ner-rnn", "ner-ukrainian") +#' language_model_map <- setNames(supported_lan_models, c("en", "de", +#' "fr", "nl", +#' "da", "ar", +#' "ner-fast", "ner-large", +#' "ner-pooled", "ner-ontonotes", +#' "ner-ontonotes-fast", "ner-ontonotes-large", +#' "de-ner-large", "de-ner-germeval", +#' "de-ner-legal", "es-ner-large", +#' "nl-ner", "nl-ner-large", +#' "nl-ner-rnn", "ner-ukrainian") +#' ) +#' +#' if (is.null(language)) { +#' language <- "en" +#' message("Language is not specified. ", language, " in Flair is forceloaded. Please ensure that the internet connectivity is stable.") +#' } +#' +#' # Translate language to model name if necessary +#' if (language %in% names(language_model_map)) { +#' language <- language_model_map[[language]] +#' } +#' +#' # Ensure the model is supported +#' check_language_supported(language = language, supported_lan_models = supported_lan_models) +#' +#' # Load the model +#' SequenceTagger <- reticulate::import("flair.models")$SequenceTagger +#' SequenceTagger$load(language) +#' } +#' +#' +#' + + + +#' #' @title Tagging Named Entities with Flair Models +#' #' +#' #' @description This function takes texts and their corresponding document IDs +#' #' as inputs, uses the Flair NLP library to extract named entities, +#' #' and returns a dataframe of the identified entities along with their tags. +#' #' When no entities are detected in a text, the function returns a data table +#' #' with NA values. 
This might clutter the results. Depending on your use case, +#' #' you might decide to either keep this behavior or skip rows with no detected +#' #' entities. +#' #' +#' #' @param texts A character vector containing the texts to process. +#' #' @param doc_ids A character or numeric vector containing the document IDs +#' #' corresponding to each text. +#' #' @param tagger An optional tagger object. If NULL (default), the function will +#' #' load a Flair tagger based on the specified language. +#' #' @param language A character string indicating the language model to load. +#' #' Default is "en". +#' #' @param show.text_id A logical value. If TRUE, includes the actual text from +#' #' which the entity was extracted in the resulting data table. Useful for +#' #' verification and traceability purposes but might increase the size of +#' #' the output. Default is FALSE. +#' #' @param gc.active A logical value. If TRUE, runs the garbage collector after +#' #' processing all texts. This can help in freeing up memory by releasing unused +#' #' memory space, especially when processing a large number of texts. +#' #' Default is FALSE. +#' #' @return A data table with columns: +#' #' \describe{ +#' #' \item{doc_id}{The ID of the document from which the entity was extracted.} +#' #' \item{text_id}{If TRUE, the actual text from which the entity +#' #' was extracted.} +#' #' \item{entity}{The named entity that was extracted from the text.} +#' #' \item{tag}{The tag or category of the named entity. 
Common tags include: +#' #' PERSON (names of individuals), +#' #' ORG (organizations, institutions), +#' #' GPE (countries, cities, states), +#' #' LOCATION (mountain ranges, bodies of water), +#' #' DATE (dates or periods), +#' #' TIME (times of day), +#' #' MONEY (monetary values), +#' #' PERCENT (percentage values), +#' #' FACILITY (buildings, airports), +#' #' PRODUCT (objects, vehicles), +#' #' EVENT (named events like wars or sports events), +#' #' ART (titles of books)}} +#' #' @examples +#' #' \dontrun{ +#' #' library(reticulate) +#' #' library(fliaR) +#' #' +#' #' texts <- c("UCD is one of the best universities in Ireland.", +#' #' "UCD has a good campus but is very far from +#' #' my apartment in Dublin.", +#' #' "Essex is famous for social science research.", +#' #' "Essex is not in the Russell Group, but it is +#' #' famous for political science research.", +#' #' "TCD is the oldest university in Ireland.", +#' #' "TCD is similar to Oxford.") +#' #' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") +#' #' # Load NER ("ner") model +#' #' tagger_ner <- load_tagger_ner('ner') +#' #' results <- get_entities(texts, doc_ids, tagger_ner) +#' #' print(results)} +#' #' +#' #' @importFrom data.table data.table rbindlist +#' #' @importFrom reticulate import +#' #' @importFrom data.table := +#' #' @export +# get_entities <- function(texts, doc_ids = NULL, tagger = NULL, language = NULL, +# show.text_id = FALSE, gc.active = FALSE) { +# +# # Check environment pre-requisites +# check_prerequisites() +# check_show.text_id(show.text_id) +# +# # Check and prepare texts and doc_ids +# texts_and_ids <- check_texts_and_ids(texts, doc_ids) +# texts <- texts_and_ids$texts +# doc_ids <- texts_and_ids$doc_ids +# +# # Load tagger if null +# if (is.null(tagger)) { +# tagger <- load_tagger_ner(language) +# } +# +# Sentence <- reticulate::import("flair")$data$Sentence +# +# # Process each text and extract entities +# process_text <- function(text, doc_id) { +# text_id <- 
NULL +# if (is.na(text) || is.na(doc_id)) { +# return(data.table(doc_id = NA, entity = NA, tag = NA)) +# } +# +# sentence <- Sentence(text) +# tagger$predict(sentence) +# entities <- sentence$get_spans("ner") +# +# if (length(entities) == 0) { +# return(data.table(doc_id = doc_id, entity = NA, tag = NA)) +# } +# +# # Unified data table creation process +# dt <- data.table( +# doc_id = rep(doc_id, length(entities)), +# entity = vapply(entities, function(e) e$text, character(1)), +# tag = vapply(entities, function(e) e$tag, character(1)) +# ) +# +# if (isTRUE(show.text_id)) { +# dt[, text_id := text] +# } +# +# return(dt) +# } +# # Activate garbage collection +# check_and_gc(gc.active) +# +# results_list <- lapply(seq_along(texts), +# function(i) {process_text(texts[[i]], doc_ids[[i]])}) +# rbindlist(results_list, fill = TRUE) +# } + + + +#' @title Load NER Tagger Model +#' +#' @description Loads a Named Entity Recognition (NER) model from Flair. +#' Verifies that the loaded model is a valid NER model. +#' +#' @param model_name Character string specifying the model name to load. +#' Default is "ner" (English NER model). +#' +#' @return A Flair SequenceTagger model object for NER. +#' @examples +#' \dontrun{ +#' # Load default English NER model +#' tagger <- load_tagger_ner() +#' +#' # Load specific model +#' tagger <- load_tagger_ner("ner-fast") +#' } +#' @export +# load_tagger_ner <- function(model_name = "ner") { +# if (is.null(model_name)) { +# model_name <- "ner" +# message("Model name is not specified. Using default 'ner' model.") +# } +# +# # Load the model +# tryCatch({ +# SequenceTagger <- flair_models()$SequenceTagger +# tagger <- SequenceTagger$load("flair/ner-english-large") +# +# # Verify the model is for NER +# # Check if the model has NER-related tags +# tags <- tagger$labels +# if (!any(grepl("PER|ORG|LOC|GPE|MISC", tags))) { +# warning("The loaded model may not be a proper NER model. 
", +# "Please ensure it's designed for Named Entity Recognition.") +# } +# +# return(tagger) +# }, error = function(e) { +# stop("Error loading model: ", model_name, "\n", e$message) +# }) +# } +#' +#' #' @title Load and Configure NER Tagger +#' #' +#' #' @description Loads a Named Entity Recognition model from Flair and displays +#' #' its tag dictionary. +#' #' +#' #' @param model_name Character string specifying the model to load. +#' #' Default is "ner". +#' #' @param show_tags Logical, whether to display the tag dictionary. +#' #' Default is TRUE. +#' #' +#' #' @return A Flair SequenceTagger model object +#' #' @export +#' load_tagger_ner <- function(model_name = "ner", show_tags = TRUE) { +#' if (is.null(model_name)) { +#' model_name <- "ner" +#' message("Model name is not specified. Using default 'ner' model.") +#' } +#' +#' # Load the model +#' tryCatch({ +#' SequenceTagger <- flair_models()$SequenceTagger +#' tagger <- SequenceTagger$load("flair/ner-english-large") +#' +#' # Extract and organize tags if requested +#' if (show_tags) { +#' tag_dict <- tagger$label_dictionary +#' tag_list <- tag_dict$get_items() +#' +#' cat("\nNER Tagger Dictionary:\n") +#' cat("========================================\n") +#' cat(sprintf("Total tags: %d\n", length(tag_list))) +#' cat("----------------------------------------\n") +#' # Group and print tags by type +#' special_tags <- grep("^<.*>$|^O$", tag_list, value = TRUE) +#' person_tags <- grep("PER", tag_list, value = TRUE) +#' org_tags <- grep("ORG", tag_list, value = TRUE) +#' loc_tags <- grep("LOC", tag_list, value = TRUE) +#' misc_tags <- grep("MISC", tag_list, value = TRUE) +#' +#' if (length(special_tags) > 0) cat("Special:", paste(special_tags, collapse = ", "), "\n") +#' if (length(person_tags) > 0) cat("Person:", paste(person_tags, collapse = ", "), "\n") +#' if (length(org_tags) > 0) cat("Organization:", paste(org_tags, collapse = ", "), "\n") +#' if (length(loc_tags) > 0) cat("Location:", paste(loc_tags, 
collapse = ", "), "\n") +#' if (length(misc_tags) > 0) cat("Miscellaneous:", paste(misc_tags, collapse = ", "), "\n") +#' +#' cat("----------------------------------------\n") +#' cat("Tag scheme: BIOES\n") +#' cat("B-: Beginning of entity\n") +#' cat("I-: Inside of entity\n") +#' cat("O: Outside (not an entity)\n") +#' cat("E-: End of entity\n") +#' cat("S-: Single token entity\n") +#' cat("========================================\n") +#' } +#' +#' return(tagger) +#' }, error = function(e) { +#' stop("Error loading model: ", model_name, "\n", e$message) +#' }) +#' } +#' +#' + +#' +#' +#' #' @title Load a Sentiment or Language Tagger Model from Flair +#' #' +#' #' @description This function loads a pre-trained sentiment or language tagger +#' #' from the Flair library. +#' #' +#' #' @param language A character string specifying the language model to load. +#' #' Supported models include: +#' #' \itemize{ +#' #' \item "sentiment" - Sentiment analysis model +#' #' \item "sentiment-fast" - Faster sentiment analysis model +#' #' \item "de-offensive-language" - German offensive language detection model +#' #'} If not provided, the function will default to the "sentiment" model. +#' #' +#' #' @return An object of the loaded Flair model. +#' #' +#' #' @import reticulate +#' #' @examples +#' #' \dontrun{ +#' #' tagger <- load_tagger_sentiments("sentiment") +#' #' } +#' #' +#' #' @export +#' load_tagger_sentiments <- function(language = NULL) { +#' supported_lan_models <- c("sentiment", "sentiment-fast", "de-offensive-language") +#' +#' if (is.null(language)) { +#' language <- "sentiment" +#' message("Language is not specified. ", language, " in Flair is forceloaded. 
Please ensure that the internet connectivity is stable.") +#' } +#' +#' # Ensure the model is supported +#' check_language_supported(language = language, supported_lan_models = supported_lan_models) +#' +#' # Load the model +#' flair <- reticulate::import("flair") +#' Classifier <- flair$nn$Classifier +#' tagger <- Classifier$load(language) +#' return(tagger) +#' } + diff --git a/R/predict_label.R b/R/predict_label.R new file mode 100644 index 00000000..a8108ae1 --- /dev/null +++ b/R/predict_label.R @@ -0,0 +1,88 @@ +#' Predict Text Label Using Flair Classifier +#' +#' This function predicts the label of input text using a Flair classifier, +#' with options for confidence thresholding to adjust predictions to NEUTRAL. +#' +#' @param text A character string containing the text to be labeled +#' @param classifier A Flair TextClassifier object for making predictions +#' @param threshold_score Numeric value between 0 and 1 representing the confidence +#' threshold for label classification. Defaults to 0.5 if not specified +#' @param threshold Logical indicating whether to apply the threshold_score to +#' adjust predictions to NEUTRAL. 
Defaults to FALSE +#' +#' @return A list containing the following elements: +#' \describe{ +#' \item{label}{Character string of the final predicted label (AGAINST/FAVOR/NEUTRAL)} +#' \item{score}{Numeric confidence score from the classifier} +#' \item{token_number}{Integer count of tokens in the input text} +#' \item{threshold_score}{Numeric value of the threshold used} +#' \item{original_label}{Character string of the classifier's original prediction +#' before thresholding} +#' } +#' +#' @examples +#' \dontrun{ +#' # Load a pre-trained classifier +#' classifier <- flair$models$TextClassifier$load('stance-classifier') +#' +#' # Predict label without thresholding +#' result1 <- predict_label( +#' text = "I strongly support this policy", +#' classifier = classifier +#' ) +#' +#' # Predict with custom threshold +#' result2 <- predict_label( +#' text = "I somewhat agree with the proposal", +#' classifier = classifier, +#' threshold_score = 0.7, +#' threshold = TRUE +#' ) +#' } +#' +#' @details +#' The function will throw an error if the classifier is NULL or not a +#' Flair TextClassifier. +#' +#' @import flaiR +#' @export +predict_label <- function(text, classifier, threshold_score = NULL, threshold = FALSE) { + # Check if classifier is provided and valid + if (is.null(classifier) || !isTRUE(class(classifier)[1] == "flair.models.text_classification_model.TextClassifier")) { + stop("Invalid or missing classifier. 
Please provide a pre-trained Flair TextClassifier model.") + } + + # Create a sentence object + sentence <- Sentence(text) + + # Use the classifier to predict + classifier$predict(sentence) + + # Get the predicted label and score + predicted_label <- sentence$labels[[1]]$value + score <- sentence$labels[[1]]$score # 移除 as.numeric + token_number <- length(sentence$tokens) + + # Set default threshold_score if NULL + if (is.null(threshold_score)) { + threshold_score <- 0.5 + } + + # Modify label based on the score threshold and original label + original_label <- predicted_label + if (threshold && score < threshold_score) { + if (predicted_label %in% c("AGAINST", "FAVOR")) { + predicted_label <- "NEUTRAL" + } + } + + # Construct the prediction result + prediction = list(label = predicted_label, + score = score, + token_number = token_number, + threshold_score = threshold_score, + original_label = original_label) + + # Return the prediction result + return(prediction) +} diff --git a/R/process_embeddings.R b/R/process_embeddings.R new file mode 100644 index 00000000..a64622d8 --- /dev/null +++ b/R/process_embeddings.R @@ -0,0 +1,144 @@ +#' Process Token Embeddings from Flair Sentence Object +#' +#' This function processes token embeddings from a Flair sentence object and converts them +#' into a matrix format with token names as row names. It handles the extraction of +#' embeddings from tokens, retrieval of token texts, and conversion to matrix format. +#' +#' @param sentence A Flair sentence object containing tokens with embeddings. +#' The sentence object should have a 'tokens' attribute, where each token +#' has both an 'embedding' (with numpy() method) and 'text' attribute. +#' @param verbose Logical indicating whether to print progress messages. Default is FALSE. 
+#' +#' @return A matrix where: +#' \itemize{ +#' \item Each row represents a token's embedding +#' \item Row names are the corresponding token texts +#' \item Columns represent the dimensions of the embedding vectors +#' } +#' +#' @details +#' The function will throw errors in the following cases: +#' \itemize{ +#' \item If sentence is NULL or has no tokens +#' \item If any token is missing an embedding +#' \item If any token is missing text +#' } +#' +#' @examples +#' \dontrun{ +#' # Create a Flair sentence +#' sentence <- Sentence("example text") +#' WordEmbeddings <- flair_embeddings()$WordEmbeddings +#' +#' # Initialize FastText embeddings trained on Common Crawl +#' fasttext_embeddings <- WordEmbeddings('en-crawl') +#' +#' # Apply embeddings +#' fasttext_embeddings$embed(sentence) +#' +#' # Process embeddings with timing and messages +#' embedding_matrix <- process_embeddings(sentence, verbose = TRUE) +#' } +#' +#' @import flaiR +#' @export +process_embeddings <- function(sentence, verbose = FALSE) { + # Start timing + start_time <- Sys.time() + + # Input validation + if (is.null(sentence) || is.null(sentence$tokens)) { + stop("Invalid input: sentence or tokens is NULL") + } + + # Check if embeddings are valid + if (!.has_valid_embeddings(sentence)) { + stop("Sentence needs to be embedded") + } + + # Extract and store embeddings for each token + if (verbose) message("Extracting token embeddings...") + sen_list <- list() + for (i in seq_along(sentence$tokens)) { + if (is.null(sentence$tokens[[i]]$embedding$numpy())) { + stop(sprintf("No embedding found for token at position %d", i)) + } + sen_list[[i]] <- as.vector(sentence$tokens[[i]]$embedding$numpy()) + } + + # Extract token texts for labeling + token_texts <- sapply(sentence$tokens, function(token) { + if (is.null(token$text)) { + stop("Token text is missing") + } + token$text + }) + + # Convert list of embeddings to matrix format + if (verbose) cat("Converting embeddings to matrix format...") + 
stacked_subset <- do.call(rbind, lapply(sen_list, function(x) { + matrix(x, nrow = 1) + })) + + # Add row names + rownames(stacked_subset) <- token_texts + + # Calculate processing time + end_time <- Sys.time() + processing_time <- round(as.numeric(difftime(end_time, start_time, units = "secs")), 3) + if (verbose) { + cat(sprintf("Processing completed in %s seconds\n", processing_time)) + cat(sprintf("Generated embedding matrix with %d tokens and %d dimensions\n", + nrow(stacked_subset), ncol(stacked_subset))) + } + + return(stacked_subset) +} + + +#' Internal function to check embedding validity +#' +#' This function verifies whether a Flair sentence object has been properly +#' embedded through the Flair framework by checking both the existence and +#' validity of embeddings. +#' +#' @param sentence A Flair sentence object to check +#' @return Logical indicating whether the sentence has valid embeddings +#' +#' @import flaiR +#' @noRd +.has_valid_embeddings <- function(sentence) { + # Input validation + if (is.null(sentence) || is.null(sentence$tokens)) { + return(FALSE) + } + + # Check if tokens exist and is not empty + if (length(sentence$tokens) == 0 || identical(sentence$tokens, numeric(0))) { + return(FALSE) + } + + # Safely get first token + first_token <- try(sentence$tokens[[1]], silent = TRUE) + if (inherits(first_token, "try-error") || + identical(first_token, numeric(0))) { + return(FALSE) + } + + # Check embedding structure exists + if (is.null(first_token$embedding)) { + return(FALSE) + } + + # Get numpy representation + embedding_array <- try(first_token$embedding$numpy(), silent = TRUE) + + # Check if embedding is valid (not numeric(0) and has values) + if (inherits(embedding_array, "try-error") || + identical(embedding_array, numeric(0)) || + length(embedding_array) == 0) { + return(FALSE) + } + + return(TRUE) +} diff --git a/README.Rmd b/README.Rmd index 80e41c73..169c1eb4 100644 --- a/README.Rmd +++ b/README.Rmd @@ -50,8 +50,13 @@ 
install.packages("remotes") remotes::install_github("davidycliao/flaiR", force = TRUE) ``` -```{r} +```{r echo=TRUE, eval=FALSE} +library(flaiR) +``` + +``` r library(flaiR) +#> flaiR: An R Wrapper for Accessing Flair NLP 0.13.1 ```
diff --git a/README.md b/README.md index 7fb284ae..00da0179 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,11 @@ remotes::install_github("davidycliao/flaiR", force = TRUE) ``` r library(flaiR) -#> flaiR: An R Wrapper for Accessing Flair NLP 0.14.0 +``` + +``` r +library(flaiR) +#> flaiR: An R Wrapper for Accessing Flair NLP 0.13.1 ```
diff --git a/_pkgdown.yml b/_pkgdown.yml index 049e4a32..d3923e99 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -204,22 +204,22 @@ reference: # - flair_models()$ClusteringModel # - flair_models()$MultitaskModel -- title: Expanded Feats - desc:
Extended functionality for working with three major functions to extract features in data.table.
-- contents: - - load_tagger_ner - - load_tagger_pos - - load_tagger_sentiments - - get_entities - - get_entities_batch - - get_pos - - get_pos_batch - - get_sentiments - - get_sentiments_batch +# - title: Expanded Feats +# desc:
Extended functionality for working with three major functions to extract features in data.table.
+# - contents: +# - load_tagger_ner +# - load_tagger_pos +# - load_tagger_sentiments +# - get_entities +# - get_pos +# - get_sentiments +# - get_sentiments_batch -- title: Toolbox +- title: FlaiR Toolbox desc:
flairR utilities
- contents: + - predict_label + - process_embeddings - flair_device - highlight_text - map_entities @@ -228,6 +228,17 @@ reference: - show_flair_cache - uninstall_python_package - install_python_package + - get_tagger_tags + - load_tagger_ner + - load_tagger_pos + - get_entities + - get_pos + + + + + + - title: Training Dataset desc:
Training dataset for tutorial
@@ -285,27 +296,28 @@ navbar: href: articles/tutorial.html#sequence-taggings - text: "Performing NER Tasks" href: articles/tutorial.html#performing-ner-tasks - - text: "Flair Embeddings" - href: articles/tutorial.html#flair-embedding + - text: "Embeddings" + href: articles/tutorial.html#embedding - text: "Training a Binary Classifier" href: articles/tutorial.html#training-a-binary-classifier - text: "Training RNNs" href: articles/tutorial.html#training-rnns - text: "Finetune Transformers " href: articles/tutorial.html#finetune-transformers + - text: "Extending conText" + href: articles/tutorial.html#extending-contexts-embedding-regression - - icon: fa-newspaper-o - text: Expanded Feats - menu: - - text: "Part-of-speech Tagging" - href: articles/get_pos.html - - text: "Named Entity Recognition" - href: articles/get_entities.html - - text: "Tagging Sentiment" - href: articles/get_sentiments.html - - text: "The Coloring Entities" - href: articles/highlight_text.html - + # - icon: fa-newspaper-o + # text: Expanded Feats + # menu: + # - text: "Part-of-speech Tagging" + # href: articles/get_pos.html + # - text: "Named Entity Recognition" + # href: articles/get_entities.html + # - text: "Tagging Sentiment" + # href: articles/get_sentiments.html + # - text: "The Coloring Entities" + # href: articles/highlight_text.html - icon: fa-file-code-o text: Reference diff --git a/man/check_tagger.Rd b/man/check_tagger.Rd new file mode 100644 index 00000000..1fcb6b81 --- /dev/null +++ b/man/check_tagger.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_entities.R +\name{check_tagger} +\alias{check_tagger} +\title{Check if Tagger is Valid} +\usage{ +check_tagger(tagger) +} +\arguments{ +\item{tagger}{A Flair tagger object to check} +} +\value{ +Logical indicating if the tagger is valid +} +\description{ +Internal function to verify if the provided tagger is valid +and has the required methods. 
+} +\keyword{internal} diff --git a/man/get_entities.Rd b/man/get_entities.Rd index 8edb583c..08fee104 100644 --- a/man/get_entities.Rd +++ b/man/get_entities.Rd @@ -2,15 +2,17 @@ % Please edit documentation in R/get_entities.R \name{get_entities} \alias{get_entities} -\title{Tagging Named Entities with Flair Models} +\title{Extract Named Entities from Texts with Batch Processing} \usage{ get_entities( texts, doc_ids = NULL, - tagger = NULL, - language = NULL, + tagger, show.text_id = FALSE, - gc.active = FALSE + gc.active = FALSE, + batch_size = 5, + device = "cpu", + verbose = FALSE ) } \arguments{ @@ -19,69 +21,125 @@ get_entities( \item{doc_ids}{A character or numeric vector containing the document IDs corresponding to each text.} -\item{tagger}{An optional tagger object. If NULL (default), the function will -load a Flair tagger based on the specified language.} - -\item{language}{A character string indicating the language model to load. -Default is "en".} +\item{tagger}{A Flair tagger object for named entity recognition. Must be provided +by the user. Can be created using load_tagger_ner() with different models: +\itemize{ +\item Standard NER: tagger_ner <- load_tagger_ner('ner') +\item OntoNotes: tagger_ner <- load_tagger_ner('flair/ner-english-ontonotes') +\item Large model: tagger_ner <- load_tagger_ner('flair/ner-english-large') +}} \item{show.text_id}{A logical value. If TRUE, includes the actual text from -which the entity was extracted in the resulting data table. Useful for -verification and traceability purposes but might increase the size of -the output. Default is FALSE.} +which the entity was extracted. Default is FALSE.} \item{gc.active}{A logical value. If TRUE, runs the garbage collector after -processing all texts. This can help in freeing up memory by releasing unused -memory space, especially when processing a large number of texts. -Default is FALSE.} +processing texts. 
Default is FALSE.} + +\item{batch_size}{An integer specifying the size of each batch. Set to 1 for +single-text processing. Default is 5.} + +\item{device}{A character string specifying the computation device ("cpu", +"cuda:0", "cuda:1", etc.). Default is "cpu". Note: MPS (Mac M1/M2) is currently +not fully supported and will default to CPU.} + +\item{verbose}{A logical value. If TRUE, prints processing progress. Default is FALSE.} } \value{ A data table with columns: \describe{ -\item{doc_id}{The ID of the document from which the entity was extracted.} -\item{text_id}{If TRUE, the actual text from which the entity -was extracted.} -\item{entity}{The named entity that was extracted from the text.} -\item{tag}{The tag or category of the named entity. Common tags include: -PERSON (names of individuals), -ORG (organizations, institutions), -GPE (countries, cities, states), -LOCATION (mountain ranges, bodies of water), -DATE (dates or periods), -TIME (times of day), -MONEY (monetary values), -PERCENT (percentage values), -FACILITY (buildings, airports), -PRODUCT (objects, vehicles), -EVENT (named events like wars or sports events), -ART (titles of books)}} +\item{doc_id}{Character or numeric. The ID of the document from which the +entity was extracted.} +\item{text_id}{Character. The complete text from which the entity was +extracted. Only included when show.text_id = TRUE.} +\item{entity}{Character. The actual named entity text that was extracted. +Will be NA if no entity was found.} +\item{tag}{Character. The category of the named entity. 
Available tags depend on +the model used: +\itemize{ +\item{Standard NER tags:} +\itemize{ +\item{PERSON: Names of people} +\item{ORG: Organizations} +\item{LOC: Locations} +\item{MISC: Miscellaneous entities} +} +\item{OntoNotes tags:} +\itemize{ +\item{PERSON: People, including fictional characters} +\item{ORG: Companies, agencies, institutions} +\item{GPE: Countries, cities, states} +\item{LOC: Non-GPE locations, mountains, water bodies} +\item{DATE: Absolute or relative dates} +\item{TIME: Times of day} +\item{MONEY: Monetary values} +\item{PERCENT: Percentage values} +\item{CARDINAL: Numerals} +\item{ORDINAL: Ordinal numbers} +\item{NORP: Nationalities, religious, or political groups} +\item{FAC: Buildings, airports, highways, bridges} +\item{WORK_OF_ART: Titles of books, songs, etc.} +\item{LAW: Named documents made into laws} +\item{LANGUAGE: Named languages} +} +}} +} } \description{ -This function takes texts and their corresponding document IDs -as inputs, uses the Flair NLP library to extract named entities, -and returns a dataframe of the identified entities along with their tags. -When no entities are detected in a text, the function returns a data table -with NA values. This might clutter the results. Depending on your use case, -you might decide to either keep this behavior or skip rows with no detected -entities. +This function processes texts in batches and extracts named entities +using the Flair NLP library. It supports both standard NER and OntoNotes models, +with options for batch processing and GPU acceleration. 
+} +\section{Tag Format}{ + +All tags use the BIOES (Begin, Inside, Outside, End, Single) scheme: +\itemize{ +\item{B-: Beginning of multi-token entity (e.g., B-PERSON in "John Smith")} +\item{I-: Inside of multi-token entity} +\item{O: Outside (not part of any entity)} +\item{E-: End of multi-token entity} +\item{S-: Single token entity (e.g., S-LOC in "Paris")} +} } + \examples{ \dontrun{ library(reticulate) library(fliaR) -texts <- c("UCD is one of the best universities in Ireland.", - "UCD has a good campus but is very far from - my apartment in Dublin.", - "Essex is famous for social science research.", - "Essex is not in the Russell Group, but it is - famous for political science research.", - "TCD is the oldest university in Ireland.", - "TCD is similar to Oxford.") -doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") -# Load NER ("ner") model -tagger_ner <- load_tagger_ner('ner') -results <- get_entities(texts, doc_ids, tagger_ner) -print(results)} +# Using standard NER model +tagger_std <- load_tagger_ner('ner') + +# Using OntoNotes model +tagger_onto <- load_tagger_ner('flair/ner-english-ontonotes') + +texts <- c( + "John Smith works at Google in New York.", + "The Eiffel Tower was built in 1889." 
+) +doc_ids <- c("doc1", "doc2") + +# Process with standard NER +results_std <- get_entities( + texts = texts, + doc_ids = doc_ids, + tagger = tagger_std, + batch_size = 2, + verbose = TRUE +) + +# Process with OntoNotes model +results_onto <- get_entities( + texts = texts, + doc_ids = doc_ids, + tagger = tagger_onto, + batch_size = 2, + verbose = TRUE +) + +# Filter specific entity types +persons <- results_onto[grepl("PERSON", tag)] +locations <- results_onto[grepl("LOC|GPE", tag)] +dates <- results_onto[grepl("DATE", tag)] +} } diff --git a/man/get_entities_batch.Rd b/man/get_entities_batch.Rd deleted file mode 100644 index e7bfe728..00000000 --- a/man/get_entities_batch.Rd +++ /dev/null @@ -1,76 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_entities_batch.R -\name{get_entities_batch} -\alias{get_entities_batch} -\title{Extract Named Entities from a Batch of Texts} -\usage{ -get_entities_batch( - texts, - doc_ids, - tagger = NULL, - language = "en", - show.text_id = FALSE, - gc.active = FALSE, - batch_size = 5, - device = "cpu", - verbose = TRUE -) -} -\arguments{ -\item{texts}{A character vector of texts to process.} - -\item{doc_ids}{A vector of document IDs corresponding to each text.} - -\item{tagger}{A pre-loaded Flair NER tagger. Default is NULL, and the tagger is loaded based on the provided language.} - -\item{language}{A character string specifying the language of the texts. Default is "en" (English).} - -\item{show.text_id}{Logical, whether to include the text ID in the output. Default is FALSE.} - -\item{gc.active}{Logical, whether to activate garbage collection after processing each batch. Default is FALSE.} - -\item{batch_size}{An integer specifying the size of each batch. Default is 5.} - -\item{device}{A character string specifying the computation device. -It can be either "cpu" or a string representation of a GPU device number. -For instance, "0" corresponds to the first GPU. 
If a GPU device number -is provided, it will attempt to use that GPU. The default is "cpu". -\itemize{ -\item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. If -there's only one GPU, specifying "cuda" or "cuda:0" will allocate -computations to this GPU.} -\item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation -of specific computations to this GPU.} -\item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems -with more GPUs.} -}} - -\item{verbose}{A logical value. If TRUE, the function prints batch processing -progress updates. Default is TRUE.} -} -\value{ -A data.table containing the extracted entities, their corresponding -tags, and document IDs. -} -\description{ -This function processes batches of texts and extracts named entities. -} -\examples{ -\dontrun{ -library(reticulate) -library(fliaR) - -texts <- c("UCD is one of the best universities in Ireland.", - "UCD has a good campus but is very far from - my apartment in Dublin.", - "Essex is famous for social science research.", - "Essex is not in the Russell Group, but it is - famous for political science research.", - "TCD is the oldest university in Ireland.", - "TCD is similar to Oxford.") -doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") -# Load NER ("ner") model -tagger_ner <- load_tagger_ner('ner') -results <- get_entities_batch(texts, doc_ids, tagger_ner) -print(results)} -} diff --git a/man/get_pos_batch.Rd b/man/get_pos_batch.Rd deleted file mode 100644 index 36384050..00000000 --- a/man/get_pos_batch.Rd +++ /dev/null @@ -1,86 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_pos_batch.R -\name{get_pos_batch} -\alias{get_pos_batch} -\title{Batch Process of Part-of-Speech Tagging} -\usage{ -get_pos_batch( - texts, - doc_ids, - tagger = NULL, - language = NULL, - show.text_id = FALSE, - gc.active = FALSE, - batch_size = 5, - device = "cpu", - 
verbose = TRUE -) -} -\arguments{ -\item{texts}{A character vector containing texts to be processed.} - -\item{doc_ids}{A character vector containing document ids.} - -\item{tagger}{A tagger object (default is NULL).} - -\item{language}{The language of the texts (default is NULL).} - -\item{show.text_id}{A logical value. If TRUE, includes the actual text from -which the entity was extracted in the resulting data table. Useful for -verification and traceability purposes but might increase the size of -the output. Default is FALSE.} - -\item{gc.active}{A logical value. If TRUE, runs the garbage collector after -processing all texts. This can help in freeing up memory by releasing unused -memory space, especially when processing a large number of texts. -Default is FALSE.} - -\item{batch_size}{An integer specifying the size of each batch. Default is 5.} - -\item{device}{A character string specifying the computation device. -\itemize{ -\item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. If -there's only one GPU, specifying "cuda" or "cuda:0" will allocate -computations to this GPU.} -\item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation -of specific computations to this GPU.} -\item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems -with more GPUs.} -}} - -\item{verbose}{A logical value. If TRUE, the function prints batch processing -progress updates. 
Default is TRUE.} -} -\value{ -A data.table containing the following columns: -\describe{ -\item{\code{doc_id}}{The document identifier corresponding to each text.} -\item{\code{token_id}}{The token number in the original text, -indicating the position of the token.} -\item{\code{text_id}}{The actual text input passed to the function (if show.text_id is TRUE).} -\item{\code{token}}{The individual word or token from the text that was -POS tagged.} -\item{\code{tag}}{The part-of-speech tag assigned to the token by -the Flair library.} -\item{\code{precision}}{A confidence score (numeric) for the -assigned POS tag.} -} -} -\description{ -This function returns a data table of POS tags and other related -data for the given texts using batch processing. -} -\examples{ -\dontrun{ -library(reticulate) -library(fliaR) -tagger_pos_fast <- load_tagger_pos('pos-fast') -texts <- c("UCD is one of the best universities in Ireland.", - "Essex is not in the Russell Group, but it is famous for political science research.", - "TCD is the oldest university in Ireland.") -doc_ids <- c("doc1", "doc2", "doc3") - -# Using the batch_size parameter -get_pos_batch(texts, doc_ids, tagger_pos_fast, batch_size = 2) -} -} diff --git a/man/get_sentiments.Rd b/man/get_sentiments.Rd deleted file mode 100644 index 99793dc1..00000000 --- a/man/get_sentiments.Rd +++ /dev/null @@ -1,71 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_sentiments.R -\name{get_sentiments} -\alias{get_sentiments} -\title{Tagging Sentiment with Flair Standard Models} -\usage{ -get_sentiments( - texts, - doc_ids, - tagger = NULL, - ..., - language = NULL, - show.text_id = FALSE, - gc.active = FALSE -) -} -\arguments{ -\item{texts}{A list or vector of texts for which sentiment prediction is -to be made.} - -\item{doc_ids}{A list or vector of document IDs corresponding to the texts.} - -\item{tagger}{An optional flair sentiment model. 
If NULL (default), -the function loads the default model based on the language.} - -\item{...}{Additional arguments passed to next.} - -\item{language}{A character string indicating the language of the texts. -Currently supports "sentiment" (English), "sentiment-fast" (English), and -"de-offensive-language" (German)} - -\item{show.text_id}{A logical value. If TRUE, includes the actual text from -which the sentiment was predicted. Default is FALSE.} - -\item{gc.active}{A logical value. If TRUE, runs the garbage collector after -processing all texts. This can help in freeing up memory by releasing unused -memory space, especially when processing a large number of texts. -Default is FALSE.} -} -\value{ -A \code{data.table} containing three columns: -\itemize{ -\item \code{doc_id}: The document ID from the input. -\item \code{sentiment}: Predicted sentiment for the text. -\item \code{score}: Score for the sentiment prediction. -} -} -\description{ -This function takes in texts and their associated document IDs -to predict sentiments using the flair Python library. 
-} -\examples{ -\dontrun{ -library(flaiR) -texts <- c("UCD is one of the best universities in Ireland.", - "UCD has a good campus but is very far from my apartment in Dublin.", - "Essex is famous for social science research.", - "Essex is not in the Russell Group, but it is famous for political science research.", - "TCD is the oldest university in Ireland.", - "TCD is similar to Oxford.") - -doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") - -# Load re-trained sentiment ("sentiment") model -tagger_sent <- load_tagger_sentiments('sentiment') - -results <- get_sentiments(texts, doc_ids, tagger_sent) -print(results) -} - -} diff --git a/man/get_sentiments_batch.Rd b/man/get_sentiments_batch.Rd deleted file mode 100644 index a19504d7..00000000 --- a/man/get_sentiments_batch.Rd +++ /dev/null @@ -1,97 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_sentiments_batch.R -\name{get_sentiments_batch} -\alias{get_sentiments_batch} -\title{Batch Process of Tagging Sentiment with Flair Models} -\usage{ -get_sentiments_batch( - texts, - doc_ids, - tagger = NULL, - ..., - language = NULL, - show.text_id = FALSE, - gc.active = FALSE, - batch_size = 5, - device = "cpu", - verbose = FALSE -) -} -\arguments{ -\item{texts}{A list or vector of texts for which sentiment prediction is -to be made.} - -\item{doc_ids}{A list or vector of document IDs corresponding to the texts.} - -\item{tagger}{An optional flair sentiment model. If NULL (default), -the function loads the default model based on the language.} - -\item{...}{Additional arguments passed to next.} - -\item{language}{A character string indicating the language of the texts. -Currently supports "sentiment" (English), "sentiment-fast" (English), and -"de-offensive-language" (German)} - -\item{show.text_id}{A logical value. If TRUE, includes the actual text from -which the sentiment was predicted. Default is FALSE.} - -\item{gc.active}{A logical value. 
If TRUE, runs the garbage collector after -processing all texts. This can help in freeing up memory by releasing unused -memory space, especially when processing a large number of texts. -Default is FALSE.} - -\item{batch_size}{An integer specifying the number of texts to be processed -at once. It can help optimize performance by leveraging parallel processing. -Default is 5.} - -\item{device}{A character string specifying the computation device. -It can be either "cpu" or a string representation of a GPU device number. -For instance, "0" corresponds to the first GPU. If a GPU device number -is provided, it will attempt to use that GPU. The default is "cpu". -\itemize{ -\item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. If -there's only one GPU, specifying "cuda" or "cuda:0" will allocate -computations to this GPU.} -\item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation -of specific computations to this GPU.} -\item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems -with more GPUs.} -}} - -\item{verbose}{A logical value. If TRUE, the function prints batch processing -progress updates. Default is TRUE.} -} -\value{ -A \code{data.table} containing three columns: -\itemize{ -\item \code{doc_id}: The document ID from the input. -\item \code{sentiment}: Predicted sentiment for the text. -\item \code{score}: Score for the sentiment prediction. -} -} -\description{ -This function takes in texts and their associated document IDs -to predict sentiments using the flair Python library. 
-} -\examples{ -\dontrun{ -library(flaiR) - - -texts <- c("UCD is one of the best universities in Ireland.", - "UCD has a good campus but is very far from my apartment in Dublin.", - "Essex is famous for social science research.", - "Essex is not in the Russell Group, but it is famous for political science research.", - "TCD is the oldest university in Ireland.", - "TCD is similar to Oxford.") - -doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") - -# Load re-trained sentiment ("sentiment") model -tagger_sent <- load_tagger_sentiments('sentiment') - -results <- get_sentiments_batch(texts, doc_ids, tagger_sent, batch_size = 3) -print(results) -} - -} diff --git a/man/get_tagger_tags.Rd b/man/get_tagger_tags.Rd new file mode 100644 index 00000000..b4189d38 --- /dev/null +++ b/man/get_tagger_tags.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flair_loaders.R +\name{get_tagger_tags} +\alias{get_tagger_tags} +\title{Extract Model Tags} +\usage{ +get_tagger_tags(tagger) +} +\arguments{ +\item{tagger}{A loaded Flair SequenceTagger model} +} +\value{ +A list of tags grouped by category: +\describe{ +\item{all}{Complete list of all available tags} +\item{special}{Special tags like \if{html}{\out{}}, O, \if{html}{\out{}}, \if{html}{\out{}}} +\item{person}{Person-related tags (e.g., B-PER, I-PER)} +\item{organization}{Organization tags (e.g., B-ORG, E-ORG)} +\item{location}{Location tags (e.g., B-LOC, S-LOC)} +\item{misc}{Miscellaneous entity tags} +} +} +\description{ +Helper function to extract and categorize tags from a loaded Flair +SequenceTagger model. The tags are grouped into categories such as person, +organization, location, and miscellaneous. 
+} +\details{ +The tags follow the BIOES (Begin, Inside, Outside, End, Single) scheme: +\itemize{ +\item{B-: Beginning of multi-token entity (e.g., B-PER in "John Smith")} +\item{I-: Inside of multi-token entity (e.g., I-PER in "John Smith")} +\item{O: Outside of any entity} +\item{E-: End of multi-token entity} +\item{S-: Single token entity (e.g., S-LOC in "Paris")} +} +} +\examples{ +\dontrun{ +# Load a NER model +tagger <- load_tagger_ner("flair/ner-english-large") + +# Extract all tags +tags <- get_tagger_tags(tagger) + +# Access specific tag categories +print(tags$person) # All person-related tags +print(tags$location) # All location-related tags + +# Example usage with text annotation +# B-PER I-PER O S-ORG +# "John Smith works at Google" + +# B-LOC E-LOC O B-ORG E-ORG +# "New York is United Nations headquarters" + +# Use tags to filter entities +person_entities <- results[tag \%in\% tags$person] +org_entities <- results[tag \%in\% tags$organization] +} + +} +\seealso{ +\code{\link{load_tagger_ner}} for loading the NER model +} diff --git a/man/load_tagger_ner.Rd b/man/load_tagger_ner.Rd index ac1c447e..27ff5128 100644 --- a/man/load_tagger_ner.Rd +++ b/man/load_tagger_ner.Rd @@ -1,39 +1,22 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/flaiR_loader.R +% Please edit documentation in R/flair_loaders.R \name{load_tagger_ner} \alias{load_tagger_ner} -\title{Load the Named Entity Recognition (NER) Tagger} +\title{Load and Configure NER Tagger} \usage{ -load_tagger_ner(language = NULL) +load_tagger_ner(model_name = "ner", show_tags = TRUE) } \arguments{ -\item{language}{A character string indicating the desired language for the NER tagger. -If \code{NULL}, the function will default to the 'pos-fast' model. 
-Supported languages and their models include: -\itemize{ -\item \code{"en"} - English NER tagging (\code{ner}) -\item \code{"de"} - German NER tagging (\code{de-ner}) -\item \code{"fr"} - French NER tagging (\code{fr-ner}) -\item \code{"nl"} - Dutch NER tagging (\code{nl-ner}) -\item \code{"da"} - Danish NER tagging (\code{da-ner}) -\item \code{"ar"} - Arabic NER tagging (\code{ar-ner}) -\item \code{"ner-fast"} - English NER fast model (\code{ner-fast}) -\item \code{"ner-large"} - English NER large mode (\code{ner-large}) -\item \code{"de-ner-legal"} - NER (legal text) (\code{de-ner-legal}) -\item \code{"nl"} - Dutch NER tagging (\code{nl-ner}) -\item \code{"da"} - Danish NER tagging (\code{da-ner}) -\item \code{"ar"} - Arabic NER tagging (\code{ar-ner}) -}} +\item{model_name}{Character string specifying the model to load. +Can be "ner" (default), "flair/ner-english-large", or "flair/ner-english-ontonotes"} + +\item{show_tags}{Logical, whether to display the tag dictionary. +Default is TRUE.} } \value{ -An instance of the Flair SequenceTagger for the specified language. +A Flair SequenceTagger model object } \description{ -A helper function to load the appropriate tagger based on the provided language. -This function supports a variety of languages/models. -} -\examples{ -# Load the English NER tagger -tagger_en <- load_tagger_ner("en") - +Loads a Named Entity Recognition model from Flair and displays +its tag dictionary. Supports both standard NER and OntoNotes models. 
} diff --git a/man/load_tagger_pos.Rd b/man/load_tagger_pos.Rd index fe582c4d..bc2fcfb1 100644 --- a/man/load_tagger_pos.Rd +++ b/man/load_tagger_pos.Rd @@ -1,12 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/flaiR_loader.R +% Please edit documentation in R/flair_loaders.R \name{load_tagger_pos} \alias{load_tagger_pos} \title{Load Flair POS Tagger} \usage{ -load_tagger_pos(language = NULL) +load_tagger_pos(model_name = "pos-fast", show_tags = TRUE) } \arguments{ +\item{model_name}{Character string specifying the model to load. +Default is "pos-fast".} + +\item{show_tags}{Logical, whether to display the tag dictionary. +Default is TRUE.} + \item{language}{A character string indicating the desired language model. If \code{NULL}, the function will default to the 'pos-fast' model. Supported language models include: \itemize{ @@ -28,10 +34,15 @@ the function will default to the 'pos-fast' model. Supported language models inc } \value{ A Flair POS tagger model corresponding to the specified (or default) language. + +A Flair tagger model object for POS tagging } \description{ This function loads the POS (part-of-speech) tagger model for a specified language using the Flair library. If no language is specified, it defaults to 'pos-fast'. + +Loads a Part-of-Speech tagging model from Flair and displays +its tag dictionary in organized categories. } \examples{ \dontrun{ diff --git a/man/load_tagger_sentiments.Rd b/man/load_tagger_sentiments.Rd deleted file mode 100644 index 8b07c18a..00000000 --- a/man/load_tagger_sentiments.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/flaiR_loader.R -\name{load_tagger_sentiments} -\alias{load_tagger_sentiments} -\title{Load a Sentiment or Language Tagger Model from Flair} -\usage{ -load_tagger_sentiments(language = NULL) -} -\arguments{ -\item{language}{A character string specifying the language model to load. 
-Supported models include: -\itemize{ -\item "sentiment" - Sentiment analysis model -\item "sentiment-fast" - Faster sentiment analysis model -\item "de-offensive-language" - German offensive language detection model -} If not provided, the function will default to the "sentiment" model.} -} -\value{ -An object of the loaded Flair model. -} -\description{ -This function loads a pre-trained sentiment or language tagger -from the Flair library. -} -\examples{ -\dontrun{ - tagger <- load_tagger_sentiments("sentiment") -} - -} diff --git a/man/predict_label.Rd b/man/predict_label.Rd new file mode 100644 index 00000000..76632910 --- /dev/null +++ b/man/predict_label.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict_label.R +\name{predict_label} +\alias{predict_label} +\title{Predict Text Label Using Flair Classifier} +\usage{ +predict_label(text, classifier, threshold_score = NULL, threshold = FALSE) +} +\arguments{ +\item{text}{A character string containing the text to be labeled} + +\item{classifier}{A Flair TextClassifier object for making predictions} + +\item{threshold_score}{Numeric value between 0 and 1 representing the confidence +threshold for label classification. Defaults to 0.5 if not specified} + +\item{threshold}{Logical indicating whether to apply the threshold_score to +adjust predictions to NEUTRAL. 
Defaults to FALSE} +} +\value{ +A list containing the following elements: +\describe{ +\item{label}{Character string of the final predicted label (AGAINST/FAVOR/NEUTRAL)} +\item{score}{Numeric confidence score from the classifier} +\item{token_number}{Integer count of tokens in the input text} +\item{threshold_score}{Numeric value of the threshold used} +\item{original_label}{Character string of the classifier's original prediction +before thresholding} +} +} +\description{ +This function predicts the label of input text using a Flair classifier, +with options for confidence thresholding to adjust predictions to NEUTRAL. +} +\details{ +The function will throw an error if the classifier is NULL or not a +Flair TextClassifier. +} +\examples{ +\dontrun{ +# Load a pre-trained classifier +classifier <- flair$models$TextClassifier$load('stance-classifier') + +# Predict label without thresholding +result1 <- predict_label( + text = "I strongly support this policy", + classifier = classifier +) + +# Predict with custom threshold +result2 <- predict_label( + text = "I somewhat agree with the proposal", + classifier = classifier, + threshold_score = 0.7, + threshold = TRUE +) +} + +} diff --git a/man/process_embeddings.Rd b/man/process_embeddings.Rd new file mode 100644 index 00000000..353cb04a --- /dev/null +++ b/man/process_embeddings.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process_embeddings.R +\name{process_embeddings} +\alias{process_embeddings} +\title{Process Token Embeddings from Flair Sentence Object} +\usage{ +process_embeddings(sentence, verbose = FALSE) +} +\arguments{ +\item{sentence}{A Flair sentence object containing tokens with embeddings. +The sentence object should have a 'tokens' attribute, where each token +has both an 'embedding' (with numpy() method) and 'text' attribute.} + +\item{verbose}{Logical indicating whether to print progress messages. 
Default is FALSE.} +} +\value{ +A matrix where: +\itemize{ +\item Each row represents a token's embedding +\item Row names are the corresponding token texts +\item Columns represent the dimensions of the embedding vectors +} +} +\description{ +This function processes token embeddings from a Flair sentence object and converts them +into a matrix format with token names as row names. It handles the extraction of +embeddings from tokens, retrieval of token texts, and conversion to matrix format. +} +\details{ +The function will throw errors in the following cases: +\itemize{ +\item If sentence is NULL or has no tokens +\item If any token is missing an embedding +\item If any token is missing text +} +} +\examples{ +\dontrun{ +# Create a Flair sentence +sentence <- Sentence("example text") +WordEmbeddings <- flair_embeddings()$WordEmbeddings + +# Initialize FastText embeddings trained on Common Crawl +fasttext_embeddings <- WordEmbeddings('en-crawl') + +# Apply embeddings +fasttext_embeddings$embed(sentence) + +# Process embeddings with timing and messages +embedding_matrix <- process_embeddings(sentence, verbose = TRUE) +} + +} diff --git a/renv.lock b/renv.lock index 4fd5d90a..29f8c376 100644 --- a/renv.lock +++ b/renv.lock @@ -11,9 +11,9 @@ "Packages": { "Matrix": { "Package": "Matrix", - "Version": "1.6-5", + "Version": "1.7-1", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R", "grDevices", @@ -24,18 +24,18 @@ "stats", "utils" ], - "Hash": "8c7115cd3a0e048bda2a7cd110549f7a" + "Hash": "5122bb14d8736372411f955e1b16bc8a" }, "Rcpp": { "Package": "Rcpp", - "Version": "1.0.13", + "Version": "1.0.13-1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "methods", "utils" ], - "Hash": "f27411eb6d9c3dada5edd444b8416675" + "Hash": "6b868847b365672d6c1677b1608da9ed" }, "RcppTOML": { "Package": "RcppTOML", @@ -81,24 +81,24 @@ }, "curl": { "Package": "curl", - "Version": "5.2.2", + "Version": "6.0.1", "Source": 
"Repository", "Repository": "CRAN", "Requirements": [ "R" ], - "Hash": "8f27335f2bcff4d6035edcc82d7d46de" + "Hash": "e8ba62486230951fcd2b881c5be23f96" }, "data.table": { "Package": "data.table", - "Version": "1.16.0", + "Version": "1.16.4", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "methods" ], - "Hash": "fb24e05d4a91d8b1c7ff8e284bde834a" + "Hash": "38bbf05fc2503143db4c734a7e5cab66" }, "digest": { "Package": "digest", @@ -120,20 +120,20 @@ }, "glue": { "Package": "glue", - "Version": "1.7.0", + "Version": "1.8.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "methods" ], - "Hash": "e0b3a53876554bd45879e596cdb10a52" + "Hash": "5899f1eaa825580172bb56c08266f37c" }, "here": { "Package": "here", "Version": "1.0.1", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "rprojroot" ], @@ -157,13 +157,13 @@ }, "jsonlite": { "Package": "jsonlite", - "Version": "1.8.8", + "Version": "1.8.9", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "methods" ], - "Hash": "e1b9c55281c5adc4dd113652d9e26768" + "Hash": "4e993b65c2c3ffbffce7bb3e2c6f832b" }, "lattice": { "Package": "lattice", @@ -207,7 +207,7 @@ "Package": "png", "Version": "0.1-8", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R" ], @@ -225,17 +225,17 @@ }, "renv": { "Package": "renv", - "Version": "1.0.7", + "Version": "1.0.11", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "utils" ], - "Hash": "397b7b2a265bc5a7a06852524dabae20" + "Hash": "47623f66b4e80b3b0587bc5d7b309888" }, "reticulate": { "Package": "reticulate", - "Version": "1.39.0", + "Version": "1.40.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -253,7 +253,7 @@ "utils", "withr" ], - "Hash": "e1a5d04397edc1580c5e0ed1dbdccf76" + "Hash": "04740f615607c4e6099356ff6d6694ee" }, "rlang": { "Package": "rlang", @@ -270,7 +270,7 @@ "Package": "rprojroot", 
"Version": "2.0.4", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R" ], @@ -322,7 +322,7 @@ }, "withr": { "Package": "withr", - "Version": "3.0.1", + "Version": "3.0.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -330,7 +330,7 @@ "grDevices", "graphics" ], - "Hash": "07909200e8bbe90426fbfeb73e1e27aa" + "Hash": "cc2d62c76458d425210d1eb1478b30b4" } } } diff --git a/renv/activate.R b/renv/activate.R index d13f9932..0eb51088 100644 --- a/renv/activate.R +++ b/renv/activate.R @@ -2,7 +2,7 @@ local({ # the requested version of renv - version <- "1.0.7" + version <- "1.0.11" attr(version, "sha") <- NULL # the project directory @@ -98,6 +98,66 @@ local({ unloadNamespace("renv") # load bootstrap tools + ansify <- function(text) { + if (renv_ansify_enabled()) + renv_ansify_enhanced(text) + else + renv_ansify_default(text) + } + + renv_ansify_enabled <- function() { + + override <- Sys.getenv("RENV_ANSIFY_ENABLED", unset = NA) + if (!is.na(override)) + return(as.logical(override)) + + pane <- Sys.getenv("RSTUDIO_CHILD_PROCESS_PANE", unset = NA) + if (identical(pane, "build")) + return(FALSE) + + testthat <- Sys.getenv("TESTTHAT", unset = "false") + if (tolower(testthat) %in% "true") + return(FALSE) + + iderun <- Sys.getenv("R_CLI_HAS_HYPERLINK_IDE_RUN", unset = "false") + if (tolower(iderun) %in% "false") + return(FALSE) + + TRUE + + } + + renv_ansify_default <- function(text) { + text + } + + renv_ansify_enhanced <- function(text) { + + # R help links + pattern <- "`\\?(renv::(?:[^`])+)`" + replacement <- "`\033]8;;ide:help:\\1\a?\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # runnable code + pattern <- "`(renv::(?:[^`])+)`" + replacement <- "`\033]8;;ide:run:\\1\a\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # return ansified text + text + + } + + renv_ansify_init <- function() { + + envir <- renv_envir_self() + if (renv_ansify_enabled()) + 
assign("ansify", renv_ansify_enhanced, envir = envir) + else + assign("ansify", renv_ansify_default, envir = envir) + + } + `%||%` <- function(x, y) { if (is.null(x)) y else x } @@ -142,7 +202,10 @@ local({ # compute common indent indent <- regexpr("[^[:space:]]", lines) common <- min(setdiff(indent, -1L)) - leave - paste(substring(lines, common), collapse = "\n") + text <- paste(substring(lines, common), collapse = "\n") + + # substitute in ANSI links for executable renv code + ansify(text) } @@ -305,8 +368,11 @@ local({ quiet = TRUE ) - if ("headers" %in% names(formals(utils::download.file))) - args$headers <- renv_bootstrap_download_custom_headers(url) + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(url) + if (length(headers) && is.character(headers)) + args$headers <- headers + } do.call(utils::download.file, args) @@ -385,10 +451,21 @@ local({ for (type in types) { for (repos in renv_bootstrap_repos()) { + # build arguments for utils::available.packages() call + args <- list(type = type, repos = repos) + + # add custom headers if available -- note that + # utils::available.packages() will pass this to download.file() + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(repos) + if (length(headers) && is.character(headers)) + args$headers <- headers + } + # retrieve package database db <- tryCatch( as.data.frame( - utils::available.packages(type = type, repos = repos), + do.call(utils::available.packages, args), stringsAsFactors = FALSE ), error = identity @@ -470,6 +547,14 @@ local({ } + renv_bootstrap_github_token <- function() { + for (envvar in c("GITHUB_TOKEN", "GITHUB_PAT", "GH_TOKEN")) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(envval) + } + } + renv_bootstrap_download_github <- function(version) { enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") @@ -477,16 +562,16 @@ local({ 
return(FALSE) # prepare download options - pat <- Sys.getenv("GITHUB_PAT") - if (nzchar(Sys.which("curl")) && nzchar(pat)) { + token <- renv_bootstrap_github_token() + if (nzchar(Sys.which("curl")) && nzchar(token)) { fmt <- "--location --fail --header \"Authorization: token %s\"" - extra <- sprintf(fmt, pat) + extra <- sprintf(fmt, token) saved <- options("download.file.method", "download.file.extra") options(download.file.method = "curl", download.file.extra = extra) on.exit(do.call(base::options, saved), add = TRUE) - } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { + } else if (nzchar(Sys.which("wget")) && nzchar(token)) { fmt <- "--header=\"Authorization: token %s\"" - extra <- sprintf(fmt, pat) + extra <- sprintf(fmt, token) saved <- options("download.file.method", "download.file.extra") options(download.file.method = "wget", download.file.extra = extra) on.exit(do.call(base::options, saved), add = TRUE) diff --git a/vignettes/get_entities.Rmd b/vignettes/get_entities.Rmd index 6a4a7745..a8b2200d 100644 --- a/vignettes/get_entities.Rmd +++ b/vignettes/get_entities.Rmd @@ -101,28 +101,3 @@ print(results) ```{r} print(results) ``` - -## Batch Processing - -
-Processing texts individually can be both inefficient and memory-intensive. On the other hand, processing all the texts simultaneously could surpass memory constraints, especially if each document in the dataset is sizable. Parsing the documents in smaller batches may provide an optimal compromise between these two scenarios. Batch processing can enhance efficiency and aid in memory management. - -By default, the batch_size parameter is set to 5. You can consider starting with this default value and then experimenting with different batch sizes to find the one that works best for your specific use case. You can monitor memory usage and processing time to help you make a decision. If you have access to a GPU, you might also try larger batch sizes to take advantage of GPU parallelism. However, be cautious not to set the batch size too large, as it can lead to out-of-memory errors. Ultimately, the choice of batch size should be based on a balance between memory constraints, processing efficiency, and the specific requirements of your entity extraction task. - -
- -```{r} -batch_process_time <- system.time({ - batch_process_results <- get_entities_batch(uk_immigration$text, - uk_immigration$speaker, - tagger_ner, - show.text_id = FALSE, - batch_size = 5) - gc() -}) -print(batch_process_time) -``` - -```{r} -print(batch_process_results) -``` diff --git a/vignettes/get_pos.Rmd b/vignettes/get_pos.Rmd index 752cf18d..0f023731 100644 --- a/vignettes/get_pos.Rmd +++ b/vignettes/get_pos.Rmd @@ -86,23 +86,3 @@ print(results) ```
-## Batch Processing -
- -By default, the batch_size parameter is set to 5. You can consider starting with this default value and then experimenting with different batch sizes to find the one that works best for your specific use case. You can monitor memory usage and processing time to help you make a decision. If you have access to a GPU, you might also try larger batch sizes to take advantage of GPU parallelism. However, be cautious not to set the batch size too large, as it can lead to out-of-memory errors. Ultimately, the choice of batch size should be based on a balance between memory constraints, processing efficiency, and the specific requirements of your entity extraction task. - -
- -```{r} -batch_process_results <- get_pos_batch(uk_immigration$text, - uk_immigration$speaker, - tagger_pos, - show.text_id = FALSE, - batch_size = 10, - verbose = TRUE) -print(batch_process_results) -``` - - - - diff --git a/vignettes/get_sentiments.Rmd b/vignettes/get_sentiments.Rmd deleted file mode 100644 index 6e74ef53..00000000 --- a/vignettes/get_sentiments.Rmd +++ /dev/null @@ -1,122 +0,0 @@ ---- -title: "Tagging Sentiment with Flair Standard Models" -author: - - name: "David (Yen-Chieh) Liao" - affiliation: "Postdoc at Text & Policy Research Group and SPIRe in UCD" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Tagging Sentiment with Flair Standard Models} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -```{r, include = FALSE} -system(paste(Sys.which("python3"), "-m pip install --upgrade pip")) -system(paste(Sys.which("python3"), "-m pip install torch")) -system(paste(Sys.which("python3"), "-m pip install flair")) -library(reticulate) -library(flaiR) -# reticulate::py_install("flair") -``` - - -## An Example Using `sentiment` Model (Pre-trained English Model) -```{r} -library(flaiR) -data("uk_immigration") -uk_immigration <- head(uk_immigration, 5) -``` - -
- -Download the English sentiment model from FlairNLP on [Hugging Face](https://huggingface.co/flair). Currently, it also supports a large English sentiment model and a German pre-trained model. - -```{r} -tagger_sent <- load_tagger_sentiments("sentiment") -``` - -Flair NLP operates under the [PyTorch](https://pytorch.org) framework. As such, we can use the `$to` method to set the device for the Flair Python library. The flair_device("cpu") allows you to select whether to use the CPU, CUDA devices (like cuda:0, cuda:1, cuda:2), or specific MPS devices on Mac (such as mps:0, mps:1, mps:2). For information on Accelerated PyTorch training on Mac, please refer to https://developer.apple.com/metal/pytorch/. For more about CUDA, please visit: https://developer.nvidia.com/cuda-zone - -```{r eval=FALSE, include=TRUE} -tagger_sent$to(flair_device("mps")) -``` - -``` -TextClassifier( - (embeddings): TransformerDocumentEmbeddings( - (model): DistilBertModel( - (embeddings): Embeddings( - (word_embeddings): Embedding(30522, 768, padding_idx=0) - (position_embeddings): Embedding(512, 768) - (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layer): ModuleList( - (0-5): 6 x TransformerBlock( - (attention): MultiHeadSelfAttention( - (dropout): Dropout(p=0.1, inplace=False) - (q_lin): Linear(in_features=768, out_features=768, bias=True) - (k_lin): Linear(in_features=768, out_features=768, bias=True) - (v_lin): Linear(in_features=768, out_features=768, bias=True) - (out_lin): Linear(in_features=768, out_features=768, bias=True) - ) - (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) - (ffn): FFN( - (dropout): Dropout(p=0.1, inplace=False) - (lin1): Linear(in_features=768, out_features=3072, bias=True) - (lin2): Linear(in_features=3072, out_features=768, bias=True) - (activation): GELUActivation() - ) - (output_layer_norm): LayerNorm((768,), eps=1e-12, 
elementwise_affine=True) - ) - ) - ) - ) - ) - (decoder): Linear(in_features=768, out_features=2, bias=True) - (dropout): Dropout(p=0.0, inplace=False) - (locked_dropout): LockedDropout(p=0.0) - (word_dropout): WordDropout(p=0.0) - (loss_function): CrossEntropyLoss() -) -``` -```{r } -results <- get_sentiments(uk_immigration$text, seq_len(nrow(uk_immigration)), - tagger_sent) -``` - -```{r} -print(results) -``` - -
- -## Batch Processing in English Sentiment Model -
- -Processing texts individually can be both inefficient and memory-intensive. On the other hand, processing all the texts simultaneously could surpass memory constraints, especially if each document in the dataset is sizable. Parsing the documents in smaller batches may provide an optimal compromise between these two scenarios. Batch processing can enhance efficiency and aid in memory management. - -By default, the batch_size parameter is set to 5. You can consider starting with this default value and then experimenting with different batch sizes to find the one that works best for your specific use case. You can monitor memory usage and processing time to help you make a decision. If you have access to a GPU, you might also try larger batch sizes to take advantage of GPU parallelism. However, be cautious not to set the batch size too large, as it can lead to out-of-memory errors. Ultimately, the choice of batch size should be based on a balance between memory constraints, processing efficiency, and the specific requirements of your entity extraction task. - -
-```{r} -batch_process_results <- get_sentiments_batch(uk_immigration$text, - uk_immigration$speaker, - tagger_sent, - show.text_id = FALSE, - batch_size = 2, - verbose = TRUE) -``` - -```{r} -print(batch_process_results) -``` - diff --git a/vignettes/test.Rmd b/vignettes/test.Rmd deleted file mode 100644 index fc228e3d..00000000 --- a/vignettes/test.Rmd +++ /dev/null @@ -1,19 +0,0 @@ ---- -title: "R Notebook" -output: html_notebook ---- - -This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. - -Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. - -```{r} -plot(cars) -``` - -Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*. - -When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Cmd+Shift+K* to preview the HTML file). - -The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed. 
- diff --git a/vignettes/tutorial.Rmd b/vignettes/tutorial.Rmd index 6fc47791..2ead3dfa 100644 --- a/vignettes/tutorial.Rmd +++ b/vignettes/tutorial.Rmd @@ -1,7 +1,7 @@ --- title: "Tutorial" author: - - name: "Yen-Chieh (David) Liao | Sohini Timbadia | Stefan Müller" + - name: "Yen-Chieh Liao | Sohini Timbadia | Stefan Müller" affiliation: "University of Birmingham & University College Dublin" output: rmarkdown::html_vignette vignette: > @@ -51,6 +51,10 @@ In addition, Flair NLP offers pre-trained models for various languages and tasks - [**Finetune BERT**](#tutorial.html#finetune-transformers) +- [**Extending conText's Embedding Regression**](#tutorial.html#extending-contexts-embedding-regression) + + + ------------------------------------------------------------------------ @@ -66,6 +70,7 @@ Sentence and Token are fundamental classes. A Sentence in Flair is an object that contains a sequence of Token objects, and it can be annotated with labels, such as named entities, part-of-speech tags, and more. It also can store embeddings for the sentence as a whole and different kinds of linguistic annotations. Here's a simple example of how you create a Sentence: + ```{r} # Creating a Sentence object @@ -90,6 +95,7 @@ print(sentence) When you use Flair to handle text data,[^1] `Sentence` and `Token` objects often play central roles in many use cases. When you create a Sentence object, it automatically tokenizes the text, removing the need to create the Token object manually. Unlike R, which indexes from 1, Python indexes from 0. Therefore, when using a for loop, I use `seq_along(sentence) - 1`. The output should be something like: + ```{r} # The Sentence object has automatically created and contains multiple Token objects @@ -108,8 +114,11 @@ print(sentence$tokens) **Retrieve the Token** +
To comprehend the string representation format of the Sentence object, tagging at least one token is adequate. Python's `get_token(n)` method allows us to retrieve the Token object for a particular token. Additionally, we can use **`[]`** to index a specific token. +
+ ```{r} # method in Python @@ -120,16 +129,20 @@ sentence$get_token(5) # indexing in R sentence[6] ``` +
-Each word (and punctuation) in the text is treated as an individual Token object. These Token objects store text information and other possible linguistic information (such as part-of-speech tags or named entity tags) and embeddings (if you used a model to generate them). +Each word (and punctuation) in the text is treated as an individual Token object. These Token objects store text information and other possible linguistic information (such as part-of-speech tags or named entity tags) and embedding (if you used a model to generate them). While you do not need to create Token objects manually, understanding how to manage them is useful in situations where you might want to fine-tune the tokenization process. For example, you can control the exactness of tokenization by manually creating Token objects from a Sentence object. This makes Flair very flexible when handling text data since the automatic tokenization feature can be used for rapid development, while also allowing users to fine-tune their tokenization. -**Annotate POS tag and NER tag** +
+**Annotate POS tag and NER tag** +
The `add_label(label_type, value)` method can be employed to assign a label to the token. In Universal POS tags, if `sentence[10]` is 'see', 'seen' might be tagged as `VERB`, indicating it is a past participle form of a verb. +
```{r} sentence[10]$add_label('manual-pos', 'VERB') @@ -139,7 +152,9 @@ sentence[10]$add_label('manual-pos', 'VERB') print(sentence[10]) ``` +
We can also add a NER (Named Entity Recognition) tag to `sentence[4]`, "UCD", identifying it as a university in Dublin. +
```{r} sentence[4]$add_label('ner', 'ORG') @@ -149,15 +164,18 @@ sentence[4]$add_label('ner', 'ORG') print(sentence[4]) ``` +
+ If we print the sentence object, `Sentence[50]` provides information for 50 tokens → ['in'/ORG, 'seen'/VERB], thus displaying two tagging pieces of information. +
+ ```{r} print(sentence) ``` [^1]: Flair is built on PyTorch, which is a library in Python. - ## **Corpus** @@ -165,18 +183,28 @@ print(sentence) The Corpus object in Flair is a fundamental data structure that represents a dataset containing text samples, usually comprising of a training set, a development set (or validation set), and a test set. It's designed to work smoothly with Flair's models for tasks like named entity recognition, text classification, and more. + + **Attributes:** +
+ - `train`: A list of sentences (List[Sentence]) that form the training dataset. - `dev` (or development): A list of sentences (List[Sentence]) that form the development (or validation) dataset. - `test`: A list of sentences (List[Sentence]) that form the test dataset. +
+ **Important Methods:** +
+ - `downsample`: This method allows you to downsample (reduce) the number of sentences in the train, dev, and test splits. - `obtain_statistics`: This method gives a quick overview of the statistics of the corpus, including the number of sentences and the distribution of labels. - `make_vocab_dictionary`: Used to create a vocabulary dictionary from the corpus. +
+ ```{r} library(flaiR) Corpus <- flair_data()$Corpus @@ -193,8 +221,12 @@ test <- list(Sentence('This is a test example.')) corp <- Corpus(train = train, dev = dev, test = test) ``` +
+ `$obtain_statistics()` method of the Corpus object in the Flair library provides an overview of the dataset statistics. The method returns a [Python dictionary](https://www.w3schools.com/python/python_dictionaries.asp) with details about the training, validation (development), and test datasets that make up the corpus. In R, you can use the jsonlite package to format JSON. +
+ ```{r} library(jsonlite) data <- fromJSON(corp$obtain_statistics()) @@ -204,9 +236,12 @@ print(formatted_str) **In R** +
+ Below, we use data from the article [*The Temporal Focus of Campaign Communication*](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwjz_bS3p5KCAxWEWEEAHcuVAi4QFnoECA8QAQ&url=https%3A%2F%2Fwww.journals.uchicago.edu%2Fdoi%2Ffull%2F10.1086%2F715165&usg=AOvVaw3f_J3sXTrym2ZR64pF3ZtN&opi=89978449) by [Stefan Muller](https://muellerstefan.net), published in the *Journal of Politics* in 2020, as an example. First, we vectorize the `cc_muller$text` using the Sentence function to transform it into a list object. Then, we reformat `cc_muller$class_pro_retro` as a factor. It's essential to note that R handles numerical values differently than Python. In R, numerical values are represented with a floating point, so it's advisable to convert them into factors or strings. Lastly, we employ the map function from the purrr package to assign labels to each sentence corpus using the `$add_label` method. +
```{r} library(purrr) @@ -229,10 +264,14 @@ test <- text[!sample] sprintf("Corpus object sizes - Train: %d | Test: %d", length(train), length(test)) ``` +
+ If you don't provide a dev set, flaiR will not force you to carve out a portion of your test set to serve as a dev set. However, in some cases when only the train and test sets are provided without a dev set, flaiR might automatically take a fraction of the train set (e.g., 10%) to use as a dev set ([#2259](https://github.com/flairNLP/flair/issues/2259#issuecomment-830040253)). This is to offer a mechanism for model selection and to prevent the model from overfitting on the train set. In the "Corpus" function, there is a random selection of the "dev" dataset. To ensure reproducibility, we need to set the seed in the flaiR framework. We can accomplish this by calling the top-level module "flair" from {flaiR} and using `$set_seed(1964L)` to set the seed. +
+ ```{r} flair <- import_flair() flair$set_seed(1964L) @@ -248,6 +287,8 @@ sprintf("Corpus object sizes - Train: %d | Test: %d | Dev: %d", length(corp$dev)) ``` +
+ In the later sections, there will be more similar processing using the `Corpus`. Following that, we will focus on advanced NLP applications.
@@ -256,12 +297,15 @@ In the later sections, there will be more similar processing using the `Corpus`. # Sequence Taggings -## **Tag Entities in Text** +## Tag Entities in Text
Let's run named entity recognition over the following example sentence: "I love Berlin and New York". To do this, all you need to do is make a Sentence object for this text, load a pre-trained model and use it to predict tags for the object. +
+ + ```{r} # attach flaiR in R library(flaiR) @@ -286,7 +330,11 @@ To print all annotations: print(sentence) ``` +
+ Use a for loop to print out each POS tag. It's important to note that Python is indexed from 0. Therefore, in an R environment, we must use `seq_along(sentence$get_labels()) - 1`. +
+ ```{r} for (i in seq_along(sentence$get_labels())) { @@ -296,11 +344,12 @@ for (i in seq_along(sentence$get_labels())) { -## **Tag Part-of-Speech in Text** +## Tag Part-of-Speech
We use flaiR/POS-english for POS tagging in the standard models on Hugging Face. +
```{r} # attach flaiR in R @@ -314,12 +363,49 @@ sentence <- Sentence('I love Berlin and New York.') Classifier <- flair_nn()$Classifier tagger <- Classifier$load('pos') +``` + +Penn Treebank POS Tags Reference + + +| Tag | Description | Example | +|-------|-------------|---------| +| DT | Determiner | the, a, these | +| NN | Noun, singular | cat, tree | +| NNS | Noun, plural | cats, trees | +| NNP | Proper noun, singular | John, London | +| NNPS | Proper noun, plural | Americans | +| VB | Verb, base form | take | +| VBD | Verb, past tense | took | +| VBG | Verb, gerund/present participle | taking | +| VBN | Verb, past participle | taken | +| VBP | Verb, non-3rd person singular present | take | +| VBZ | Verb, 3rd person singular present | takes | +| JJ | Adjective | big | +| RB | Adverb | quickly | +| O | Other | - | +| , | Comma | , | +| . | Period | . | +| : | Colon | : | +| -LRB- | Left bracket | ( | +| -RRB- | Right bracket | ) | +| `` | Opening quotation | " | +| '' | Closing quotation | " | +| HYPH | Hyphen | - | +| CD | Cardinal number | 1, 2, 3 | +| IN | Preposition | in, on, at | +| PRP | Personal pronoun | I, you, he | +| PRP$ | Possessive pronoun | my, your | +| UH | Interjection | oh, wow | +| FW | Foreign word | café | +| SYM | Symbol | +, % | + + +```{r} # run NER over sentence tagger$predict(sentence) ``` -To print all annotations: - ```{r} # print the sentence with all annotations print(sentence) @@ -332,10 +418,7 @@ for (i in seq_along(sentence$get_labels())) { print(sentence$get_labels()[[i]]) } ``` - - - -## **Detect Sentiment** +## Detect Sentiment
@@ -343,6 +426,8 @@ Let's run sentiment analysis over the same sentence to determine whether it is P You can do this with essentially the same code as above. Instead of loading the 'ner' model, you now load the 'sentiment' model: +
+ ```{r} # attach flaiR in R library(flaiR) @@ -364,89 +449,163 @@ tagger$predict(sentence) print(sentence) ``` - ------------------------------------------------------------------------ -### **Tagging Parts-of-Speech with Flair Models** +## Dealing with Dataframe -
+### Parts-of-Speech Tagging Across Full DataFrame -You can load the pre-trained model `"pos-fast"`. For more pre-trained models, see . +
-```{r} -texts <- c("UCD is one of the best universities in Ireland.", - "UCD has a good campus but is very far from my apartment in Dublin.", - "Essex is famous for social science research.", - "Essex is not in the Russell Group, but it is famous for political science research and in 1994 Group.", - "TCD is the oldest university in Ireland.", - "TCD is similar to Oxford.") +You can apply Part-of-Speech (POS) tagging across an entire DataFrame using Flair's pre-trained models. Let's walk through an example using the pos-fast model. You can apply Part-of-Speech (POS) tagging across an entire DataFrame using Flair's pre-trained models. Let's walk through an example using the pos-fast model. +First, let's load our required packages and sample data: -doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") -``` +
```{r} library(flaiR) +data(uk_immigration) +uk_immigration <- uk_immigration[1:2,] ``` +
+ +For POS tagging, we'll use Flair's pre-trained model. The pos-fast model offers a good balance between speed and accuracy. For more pre-trained models, check out Flair's documentation at Flair POS Tagging Documentation. There are two ways to load the POS tagger: + +- Load with tag dictionary display (default): + +
+ + ```{r} tagger_pos <- load_tagger_pos("pos-fast") ``` +This will show you all available POS tags grouped by categories (nouns, verbs, adjectives, etc.). + +- Load without tag display for a cleaner output: +```{r} +# 載入但不顯示標籤 +pos_tagger <- load_tagger_pos("pos-fast", + show_tags = FALSE) +``` + +Now we can process our texts: ```{r} -results <- get_pos(texts, doc_ids, tagger_pos) +results <- get_pos(texts = uk_immigration$text, + doc_ids = uk_immigration$speaker, + show.text_id = TRUE, + tagger = pos_tagger) + head(results, n = 10) ``` -
-### **Tagging Entities with Flair Models** +### Tagging Entities Across Full DataFrame
+This section focuses on performing Named Entity Recognition (NER) on data stored in a dataframe format. My goal is to identify and tag named entities within text that is organized in a structured dataframe. + + +I load the flaiR package and use the built-in uk_immigration dataset. For demonstration purposes, I'm only taking the first two rows. This dataset contains discussions about immigration in the UK. + + Load the pre-trained model `ner`. For more pre-trained models, see . +
+ + ```{r} library(flaiR) +data(uk_immigration) +uk_immigration <- head(uk_immigration, n = 2) ``` +
-```{r} -tagger_ner <- load_tagger_ner("ner") -``` +Next, I load the latest model hosted and maintained on Hugging Face by the Flair NLP team. For more Flair NER models, you can visit the official Flair NLP page on Hugging Face (https://huggingface.co/flair). + +
```{r} -results <- get_entities(texts, doc_ids, tagger_ner) -head(results, n = 10) +# Load model without displaying tags +# tagger <- load_tagger_ner("flair/ner-english-large", show_tags = FALSE) + +library(flaiR) +tagger_ner <- load_tagger_ner("flair/ner-english-ontonotes") ``` - + + + + -### **Tagging Sentiment** -
+ + + -Load the pre-trained model "`sentiment`". The pre-trained models of "`sentiment`", "`sentiment-fast`", and "`de-offensive-language`" are currently available. For more pre-trained models, see . + -```{r} -library(flaiR) -``` + + + -```{r} -tagger_sent <- load_tagger_sentiments("sentiment") -``` + + + + + + + +
+ +I load a pre-trained NER model. Since I'm using a Mac M1/M2, I set the model to run on the MPS device for faster processing. If I want to use other pre-trained models, I can check the Flair documentation website for available options. + +
+ +Now I'm ready to process the text: ```{r} -results <- get_sentiments(texts, doc_ids, tagger_sent) +results <- get_entities(texts = uk_immigration$text, + doc_ids = uk_immigration$speaker, + tagger = tagger_ner, + batch_size = 2, + verbose = FALSE) head(results, n = 10) ``` -
------------------------------------------------------------------------- + + + + + + + + + + + + + + + + + + + + + + + + + @@ -543,13 +702,16 @@ head(results, n = 10) ------------------------------------------------------------------------ -# Flair Embedding +# Embedding
Flair is a very popular natural language processing library, providing a variety of embedding methods for text representation. Flair Embeddings is a word embedding framework developed by [Zalando](https://engineering.zalando.com/posts/2018/11/zalando-research-releases-flair.html). It focuses on word-level representation and can capture contextual information of words, allowing the same word to have different embeddings in different contexts. Unlike traditional word embeddings (such as Word2Vec or GloVe), Flair can dynamically generate word embeddings based on context and has achieved excellent results in various NLP tasks. Below are some key points about Flair Embeddings: +
+ **Context-Aware** +
Flair is a dynamic word embedding technique that can understand the meaning of words based on context. In contrast, static word embeddings, such as Word2Vec or GloVe, provide a fixed embedding for each word without considering its context in a sentence. @@ -566,6 +728,8 @@ Here, the word "bank" has two different meanings. In the first sentence, it refe For static embeddings, the word "bank" might have an embedding that lies somewhere between these two meanings because it doesn't consider context. But for dynamic embeddings like Flair, "bank" in the first sentence will have an embedding related to rivers, and in the second sentence, it will have an embedding related to finance. +
+ ```{r} FlairEmbeddings <- flair_embeddings()$FlairEmbeddings Sentence <- flair_data()$Sentence @@ -588,10 +752,14 @@ bank_embedding_sentence1 = sentence1[5]$embedding # "bank" is the seventh word bank_embedding_sentence2 = sentence2[6]$embedding # "bank" is the sixth word ``` +
+ Same word, similar vector representation, but essentially different. In this way, you can see how the dynamic embeddings for "bank" in the two sentences differ based on context. Although we printed the embeddings here, in reality, they would be high-dimensional vectors, so you might see a lot of numbers. If you want a more intuitive view of the differences, you could compute the cosine similarity or other metrics between the two embeddings. This is just a simple demonstration. In practice, you can also combine multiple embedding techniques, such as `WordEmbeddings` and `FlairEmbeddings`, to get richer word vectors. +
+ ```{r} library(lsa) cosine(as.numeric( bank_embedding_sentence1$numpy()), @@ -600,9 +768,13 @@ cosine(as.numeric( bank_embedding_sentence1$numpy()), **Character-Based** +
+ Flair uses a character-level language model, meaning it can generate embeddings for rare words or even misspelled words. This is an important feature because it allows the model to understand and process words that have never appeared in the training data. Flair uses a bidirectional LSTM (Long Short-Term Memory) network that operates at a character level. This allows it to feed individual characters into the LSTM instead of words. +
**Multilingual Support** +
Flair provides various pre-trained character-level language models, supporting contextual word embeddings for multiple languages. It allows you to easily combine different word embeddings (e.g., Flair Embeddings, Word2Vec, GloVe, etc.) to create powerful stacked embeddings. @@ -616,6 +788,8 @@ Flair provides various pre-trained character-level language models, supporting c In Flair, the simplest form of embeddings that still contains semantic information about the word are called classic word embeddings. These embeddings are pre-trained and non-contextual. Let's retrieve a few word embeddings and use FastText embeddings with the following code. To do so, we simply instantiate a WordEmbeddings class by passing in the ID of the embedding of our choice. Then, we simply wrap our text into a Sentence object, and call the `embed(sentence)` method on our WordEmbeddings class. +
+ ```{r} WordEmbeddings <- flair_embeddings()$WordEmbeddings @@ -630,8 +804,12 @@ for (i in seq_along(sentence$tokens)) { ``` +
+ Flair supports a range of classic word embeddings, each offering unique features and application scopes. Below is an overview, detailing the ID required to load each embedding and its corresponding language. +
+ | Embedding Type | ID | Language | |-----------------------------|---------|------------| | GloVe | glove | English | @@ -673,7 +851,6 @@ Flair supports a range of classic word embeddings, each offering unique features ------------------------------------------------------------------------- ## Contexual Embeddings @@ -683,6 +860,9 @@ The idea behind contextual string embeddings is that each word embedding should Because we are using a forward model, it only takes into account the context that occurs before a word. Additionally, since our word has no context on the left-hand side of its position in the sentence, the two embeddings are identical, and the code assumes they are identical, indeed output is **True**. + + + ```{r} FlairEmbeddings <- flair_embeddings()$FlairEmbeddings embedding <- FlairEmbeddings('news-forward') @@ -694,8 +874,13 @@ embedding$embed(s2) cat(" s1 sentence:", paste(s1[0], sep = ""), "\n", "s2 sentence:", paste(s2[0], sep = "")) ``` +
+ We test whether the sum of the two 2048 embeddings of `nice` is equal to 2048. If it is true, it indicates that the embedding results are consistent, which should theoretically be the case. +
+ + ```{r} length(s1[0]$embedding$numpy()) == sum(s1[0]$embedding$numpy() == s2[0]$embedding$numpy()) ``` @@ -716,10 +901,15 @@ The two sets of embeddings are not identical because the words are different, so length(s1[0]$embedding$numpy()) == sum(s1[0]$embedding$numpy() == s2[0]$embedding$numpy()) ``` +
+ The measure of similarity between two vectors in an inner product space is known as cosine similarity. The formula for calculating cosine similarity between two vectors, such as vectors A and B, is as follows: $Cosine Similarity = \frac{\sum_{i} (A_i \cdot B_i)}{\sqrt{\sum_{i} (A_i^2)} \cdot \sqrt{\sum_{i} (B_i^2)}}$ + +
+ ```{r} library(lsa) vector1 <- as.numeric(s1[0]$embedding$numpy()) @@ -733,7 +923,7 @@ cosine_similarity <- cosine(vector1, vector2) print(cosine_similarity) ``` - + ------------------------------------------------------------------------ @@ -744,6 +934,8 @@ print(cosine_similarity) First, we utilize the `flair.embeddings.TransformerWordEmbeddings` function to download BERT, and more transformer models can also be found on [Flair NLP's Hugging Face](https://huggingface.co/flair). + + ```{r} library(flaiR) TransformerWordEmbeddings <- flair_embeddings()$TransformerWordEmbeddings("bert-base-uncased") @@ -767,115 +959,447 @@ for (i in seq_along(sentence$tokens)) { } ``` - ------------------------------------------------------------------------ -# Training a Binary Classifier +## Visialized Embeddings -
-In this section, we'll train a sentiment analysis model that can categorize text as either positive or negative. This case study is adapted from pages 116 to 130 of Tadej Magajna's book, '[Natural Language Processing with Flair](https://www.packtpub.com/product/natural-language-processing-with-flair/9781801072311)'. The process for training text classifiers in Flair mirrors the process followed for sequence labeling models. Specifically, the steps to train text classifiers are: +### Word Embeddings (GloVe) -- Load a tagged corpus and compute the label dictionary map. -- Prepare the document embeddings. -- Initialize the `TextClassifier` class. -- Train the model. +- GloVe embeddings are Pytorch vectors of dimensionality 100. -
+- For English, Flair provides a few more options. Here, you can use `en-glove` and `en-extvec` with the __WordEmbeddings__ class. +```{r} +# Initialize Text Processing Tools --------------------------- +# Import Sentence class for text operations +Sentence <- flair_data()$Sentence -## Loading a Tagged Corpus +# Configure GloVe Embeddings -------------------------------- +# Load WordEmbeddings class and initialize GloVe model +WordEmbeddings <- flair_embeddings()$WordEmbeddings +embedding <- WordEmbeddings("glove") -
+# Text Processing and Embedding ----------------------------- +# Create sentence with semantic relationship pairs +sentence <- Sentence("King Queen man woman Paris London apple orange Taiwan Dublin Bamberg") -Training text classification models requires a set of text documents (typically, sentences or paragraphs) where each document is associated with one or more classification labels. To train our sentiment analysis text classification model, we will be using the famous Internet Movie Database (IMDb) dataset, which contains 50,000 movie reviews from IMDB, where each review is labeled as either positive or negative. References to this dataset are already baked into Flair, so loading the dataset couldn't be easier: +# Apply GloVe embeddings to the sentence +embedding$embed(sentence) -```{r} -library(flaiR) -# load IMDB from flair_datasets module -Corpus <- flair_data()$Corpus -IMDB <- flair_datasets()$IMDB -``` +# Extract embeddings into matrix format +sen_df <- process_embeddings(sentence, + verbose = TRUE) -```{r} -# downsize to 0.05 -corpus = IMDB() -corpus$downsample(0.05) -``` +# Dimensionality Reduction --------------------------------- +# Set random seed for reproducibility +set.seed(123) -Print the sizes in the corpus object as follows - test: %d \| train: %d \| dev: %d" +# Apply PCA to reduce dimensions to 3 components +pca_result <- prcomp(sen_df, center = TRUE, scale. = TRUE) -```{r} -test_size <- length(corpus$test) -train_size <- length(corpus$train) -dev_size <- length(corpus$dev) -output <- sprintf("Corpus object sizes - Test: %d | Train: %d | Dev: %d", test_size, train_size, dev_size) -print(output) +# Extract first three principal components +word_embeddings_matrix <- as.data.frame(pca_result$x[,1:3]) +word_embeddings_matrix ``` -```{r} -lbl_type = 'sentiment' -label_dict = corpus$make_label_dictionary(label_type=lbl_type) -``` -
-## Loading the Embeddings -
+ + + + + + + + -flaiR covers all the different types of document embeddings that we can use. Here, we simply use `DocumentPoolEmbeddings`. They require no training prior to training the classification model itself: + + + + -```{r} -DocumentPoolEmbeddings <- flair_embeddings()$DocumentPoolEmbeddings -WordEmbeddings <- flair_embeddings()$WordEmbeddings -glove = WordEmbeddings('glove') -document_embeddings = DocumentPoolEmbeddings(glove) -``` + + + + + + + -
+ + + -## Initializing the TextClassifier + + + + -
-```{r} -# initiate TextClassifier -TextClassifier <- flair_models()$TextClassifier -classifier <- TextClassifier(document_embeddings, - label_dictionary = label_dict, - label_type = lbl_type) +#### 2D Plot +```{r, out.width="95%" } +library(ggplot2) +glove_plot2D <- ggplot(word_embeddings_matrix, aes(x = PC1, y = PC2, color = PC3, + label = rownames(word_embeddings_matrix))) + + geom_point(size = 3) + + geom_text(vjust = 1.5, hjust = 0.5) + + scale_color_gradient(low = "blue", high = "red") + + theme_minimal() + + labs(title = "", x = "PC1", y = "PC2", color = "PC3") + # guides(color = "none") +glove_plot2D ``` -`$to` allows you to set the device to use CPU, GPU, or specific MPS devices on Mac (such as mps:0, mps:1, mps:2). +#### 3D Plot -```{r eval=FALSE, include=TRUE} -classifier$to(flair_device("mps")) -``` +[plotly](https://plotly.com/r/) in R API: https://plotly.com/r/ -``` -TextClassifier( - (embeddings): DocumentPoolEmbeddings( - fine_tune_mode=none, pooling=mean - (embeddings): StackedEmbeddings( - (list_embedding_0): WordEmbeddings( - 'glove' - (embedding): Embedding(400001, 100) - ) - ) - ) - (decoder): Linear(in_features=100, out_features=3, bias=True) - (dropout): Dropout(p=0.0, inplace=False) - (locked_dropout): LockedDropout(p=0.0) - (word_dropout): WordDropout(p=0.0) - (loss_function): CrossEntropyLoss() -) +```{r, message = FALSE, warning = FALSE, out.width="95%"} +library(plotly) +glove_plot3D <- plot_ly(data = word_embeddings_matrix, + x = ~PC1, y = ~PC2, z = ~PC3, + type = "scatter3d", mode = "markers", + marker = list(size = 5), + text = rownames(word_embeddings_matrix), hoverinfo = 'text') + +glove_plot3D ``` -
-## Training the Model + +### Stack Embeddings Method (GloVe + Back/forwad FlairEmbeddings or More) + +```{r} +# Initialize Embeddings ----------------------------- +# Load embedding types from flaiR +WordEmbeddings <- flair_embeddings()$WordEmbeddings +FlairEmbeddings <- flair_embeddings()$FlairEmbeddings +StackedEmbeddings <- flair_embeddings()$StackedEmbeddings + +# Configure Embeddings ---------------------------- +# Initialize GloVe word embeddings +glove_embedding <- WordEmbeddings('glove') + +# Initialize Flair contextual embeddings +flair_embedding_forward <- FlairEmbeddings('news-forward') +flair_embedding_backward <- FlairEmbeddings('news-backward') + +# Initialize GloVe for individual use +embedding <- WordEmbeddings("glove") + +# Create stacked embeddings combining GloVe and bidirectional Flair +stacked_embeddings <- StackedEmbeddings(c(glove_embedding, + flair_embedding_forward, + flair_embedding_backward)) + +# Text Processing -------------------------------- +# Load Sentence class from flaiR +Sentence <- flair_data()$Sentence + +# Create test sentence with semantic relationships +sentence <- Sentence("King Queen man woman Paris London apple orange Taiwan Dublin Bamberg") + +# Apply embeddings and extract features ---------- +# Embed text using stacked embeddings +stacked_embeddings$embed(sentence) + +# Extract embeddings matrix with processing details +sen_df <- process_embeddings(sentence, + verbose = TRUE) + +# Dimensionality Reduction ----------------------- +set.seed(123) + +# Perform PCA for visualization +pca_result <- prcomp(sen_df, center = TRUE, scale. 
= TRUE) + +# Extract first three principal components +word_embeddings_matrix <- as.data.frame(pca_result$x[,1:3]) +word_embeddings_matrix +``` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +```{r, out.width="95%" } +# 2D Plot +library(ggplot2) + +stacked_plot2D <- ggplot(word_embeddings_matrix, aes(x = PC1, y = PC2, color = PC3, + label = rownames(word_embeddings_matrix))) + + geom_point(size = 2) + + geom_text(vjust = 1.5, hjust = 0.5) + + scale_color_gradient(low = "blue", high = "red") + + theme_minimal() + + labs(title = "", x = "PC1", y = "PC2", color = "PC3") + # guides(color = "none") + +stacked_plot2D +``` + + +### Transformer Embeddings (BERT or More) + +```{r} +# Load Required Package ---------------------------- +library(flaiR) + +# Initialize BERT and Text Processing -------------- +# Import Sentence class for text operations +Sentence <- flair_data()$Sentence + +# Initialize BERT model (base uncased version) +TransformerWordEmbeddings <- flair_embeddings()$TransformerWordEmbeddings("bert-base-uncased") + +# Text Processing and Embedding -------------------- +# Create sentence with semantic relationship pairs +sentence <- Sentence("King Queen man woman Paris London apple orange Taiwan Dublin Bamberg") + +# Apply BERT embeddings to the sentence +TransformerWordEmbeddings$embed(sentence) + +# Extract embeddings into matrix format +sen_df <- process_embeddings(sentence, verbose = TRUE) + +# Dimensionality Reduction ------------------------ +# Set random seed for reproducibility +set.seed(123) + +# Apply PCA to reduce dimensions to 3 components +pca_result <- prcomp(sen_df, center = TRUE, scale. 
= TRUE) + +# Extract first three principal components +word_embeddings_matrix <- as.data.frame(pca_result$x[,1:3]) +word_embeddings_matrix +``` + + + + + + + + + + + + + + + + + + + + + + + + + +```{r, out.width="95%" } +# 2D Plot +library(ggplot2) + +bert_plot2D <- ggplot(word_embeddings_matrix, aes(x = PC1, y = PC2, color = PC3, + label = rownames(word_embeddings_matrix))) + + geom_point(size = 2) + + geom_text(vjust = 1.5, hjust = 0.5) + + scale_color_gradient(low = "blue", high = "red") + + theme_minimal() + + labs(title = "", x = "PC1", y = "PC2", color = "PC3") + # guides(color = "none") + +stacked_plot2D +``` + +### Embedding Models Comparison +```{r, out.width="95%" } +library(ggpubr) + +figure <- ggarrange(glove_plot2D, stacked_plot2D, bert_plot2D, + labels = c("Glove", "Stacked Embedding", "BERT"), + ncol = 3, nrow = 1, + common.legend = TRUE, + legend = "bottom", + font.label = list(size = 8)) + +figure +``` + +------------------------------------------------------------------------ + + +# Training a Binary Classifier + +
+ +In this section, we'll train a sentiment analysis model that can categorize text as either positive or negative. This case study is adapted from pages 116 to 130 of Tadej Magajna's book, '[Natural Language Processing with Flair](https://www.packtpub.com/product/natural-language-processing-with-flair/9781801072311)'. The process for training text classifiers in Flair mirrors the process followed for sequence labeling models. Specifically, the steps to train text classifiers are: + +- Load a tagged corpus and compute the label dictionary map. +- Prepare the document embeddings. +- Initialize the `TextClassifier` class. +- Train the model. + +
+ + +## Loading a Tagged Corpus + +
+ +Training text classification models requires a set of text documents (typically, sentences or paragraphs) where each document is associated with one or more classification labels. To train our sentiment analysis text classification model, we will be using the famous Internet Movie Database (IMDb) dataset, which contains 50,000 movie reviews from IMDB, where each review is labeled as either positive or negative. References to this dataset are already baked into Flair, so loading the dataset couldn't be easier: + +
+ + +```{r} +library(flaiR) +# load IMDB from flair_datasets module +Corpus <- flair_data()$Corpus +IMDB <- flair_datasets()$IMDB +``` + +```{r} +# downsize to 0.05 +corpus = IMDB() +corpus$downsample(0.05) +``` + +Print the sizes in the corpus object as follows - test: %d \| train: %d \| dev: %d" + +```{r} +test_size <- length(corpus$test) +train_size <- length(corpus$train) +dev_size <- length(corpus$dev) +output <- sprintf("Corpus object sizes - Test: %d | Train: %d | Dev: %d", test_size, train_size, dev_size) +print(output) +``` + +```{r} +lbl_type = 'sentiment' +label_dict = corpus$make_label_dictionary(label_type=lbl_type) +``` + + +## Loading the Embeddings + +
+ +flaiR covers all the different types of document embeddings that we can use. Here, we simply use `DocumentPoolEmbeddings`. They require no training prior to training the classification model itself: +
+ + +```{r} +DocumentPoolEmbeddings <- flair_embeddings()$DocumentPoolEmbeddings +WordEmbeddings <- flair_embeddings()$WordEmbeddings +glove = WordEmbeddings('glove') +document_embeddings = DocumentPoolEmbeddings(glove) +``` + + + +## Initializing the TextClassifier + +```{r} +# initiate TextClassifier +TextClassifier <- flair_models()$TextClassifier +classifier <- TextClassifier(document_embeddings, + label_dictionary = label_dict, + label_type = lbl_type) +``` + +
+ +`$to` allows you to set the device to use CPU, GPU, or specific MPS devices on Mac (such as mps:0, mps:1, mps:2). + +
+ +```{r eval=FALSE, include=TRUE} +classifier$to(flair_device("mps")) +``` + +``` +TextClassifier( + (embeddings): DocumentPoolEmbeddings( + fine_tune_mode=none, pooling=mean + (embeddings): StackedEmbeddings( + (list_embedding_0): WordEmbeddings( + 'glove' + (embedding): Embedding(400001, 100) + ) + ) + ) + (decoder): Linear(in_features=100, out_features=3, bias=True) + (dropout): Dropout(p=0.0, inplace=False) + (locked_dropout): LockedDropout(p=0.0) + (word_dropout): WordDropout(p=0.0) + (loss_function): CrossEntropyLoss() +) +``` + + +## Training the Model
@@ -886,6 +1410,9 @@ Training the text classifier model involves two simple steps: **It is worth noting that the 'L' in numbers like 32L and 5L is used in R to denote that the number is an integer. Without the 'L' suffix, numbers in R are treated as numeric, which are by default double-precision floating-point numbers. In contrast, Python determines the type based on the value of the number itself. Whole numbers (e.g., 5 or 32) are of type int, while numbers with decimal points (e.g., 5.0) are of type float. Floating-point numbers in both languages are representations of real numbers but can have some approximation due to the way they are stored in memory.** +
+ + ```{r} # initiate ModelTrainer ModelTrainer <- flair_trainers()$ModelTrainer @@ -903,7 +1430,6 @@ trainer$train('classifier', max_epochs=10L) ``` - ## Loading and Using the Classifiers @@ -912,6 +1438,8 @@ trainer$train('classifier', After training the text classification model, the resulting classifier will already be stored in memory as part of the classifier variable. It is possible, however, that your Python session exited after training. If so, you'll need to load the model into memory with the following: + + ```{r} TextClassifier <- flair_models()$TextClassifier classifier <- TextClassifier$load('classifier/best-model.pt') @@ -935,7 +1463,6 @@ classifier$predict(sentence) print(sentence$labels) ``` - ------------------------------------------------------------------------ @@ -948,7 +1475,6 @@ Here, we train a sentiment analysis model to categorize text. In this case, we a ## Import Necessary Modules -
```{r} @@ -960,7 +1486,6 @@ TextClassifier <- flair_models()$TextClassifier ModelTrainer <- flair_trainers()$ModelTrainer ``` -
## Get the IMDB Corpus @@ -968,6 +1493,9 @@ ModelTrainer <- flair_trainers()$ModelTrainer The IMDB movie review dataset is used here, which is a commonly utilized dataset for sentiment analysis. `$downsample(0.1)` method means only 10% of the dataset is used, allowing for a faster demonstration. + + + ```{r} # load the IMDB file and downsize it to 0.1 IMDB <- flair_datasets()$IMDB @@ -976,13 +1504,14 @@ corpus <- IMDB()$downsample(0.1) lbl_type <- 'sentiment' label_dict <- corpus$make_label_dictionary(label_type=lbl_type) ``` - ## Stacked Embeddings
This is one of Flair's most powerful features: it allows for the integration of embeddings to enable the model to learn from more sparse features. Three types of embeddings are utilized here: GloVe embeddings, and two types of Flair embeddings (forward and backward). Word embeddings are used to convert words into vectors. +
+ ```{r} # make a list of word embeddings @@ -1005,7 +1534,6 @@ classifier <- TextClassifier(document_embeddings, # initialize the text classifier trainer with our corpus trainer <- ModelTrainer(classifier, corpus) ``` - ## Start the Training @@ -1016,6 +1544,8 @@ For the sake of this example, setting max_epochs to 5. You might want to increas It is worth noting that the learning rate is a parameter that determines the step size at each iteration while moving towards a minimum of the loss function. A smaller learning rate could slow down the learning process, but it could lead to more precise convergence. `mini_batch_size` determines the number of samples that will be used to compute the gradient at each step. The 'L' in 32L is used in R to denote that the number is an integer. `patience` (aka early stop) is a hyper-parameter used in conjunction with early stopping to avoid overfitting. It determines the number of epochs the training process will tolerate without improvements before stopping the training. Setting max_epochs to 5 means the algorithm will make five passes through the dataset. + + ```{r eval=FALSE, include=TRUE} # note: the 'L' in 32L is used in R to denote that the number is an integer. @@ -1025,19 +1555,16 @@ trainer$train('models/sentiment', patience=5L, max_epochs=5L) ``` - ## To Apply the Trained Model for Prediction -


```{r eval=FALSE, include=TRUE}
Sentence <- flair_data()$Sentence
sentence <- Sentence("This movie was really exciting!")
classifier$predict(sentence)
print(sentence$labels)
```
-
------------------------------------------------------------------------ @@ -1046,16 +1573,16 @@ print(sentence.labels)
We use data from *The Temporal Focus of Campaign Communication (2020 JOP)* as an example. Let's assume we receive the data for training from different times. First, suppose you have a dataset of 1000 entries called `cc_muller_old`. On another day, with the help of nice friends, you receive another set of data, adding 2000 entries in a dataset called `cc_muller_new`. Both subsets are from `data(cc_muller)`. We will show how to fine-tune a transformer model with `cc_muller_old`, and then continue with another round of fine-tuning using `cc_muller_new`. +
+ ```{r} library(flaiR) ``` - ## Fine-tuning a Transformers Model -
**Step 1** Load Necessary Modules from Flair @@ -1078,8 +1605,13 @@ TextClassifier <- flair_models()$TextClassifier ModelTrainer <- flair_trainers()$ModelTrainer ``` +
+ We use purrr to help us split sentences using Sentence from `flair_data()`, then use map2 to add labels, and finally use `Corpus` to segment the data. +
+ + ```{r} library(purrr) @@ -1111,8 +1643,13 @@ old_test <- old_test[test_id] old_dev <- old_test[!test_id] ``` +
+ If you do not provide a development set (dev set) while using Flair, it will automatically split the training data into training and development datasets. The test set is used for training the model and evaluating its final performance, whereas the development set is used for adjusting model parameters and preventing overfitting, or in other words, for early stopping of the model. +
+ + ```{r} old_corpus <- Corpus(train = old_train, test = old_test) ``` @@ -1123,14 +1660,23 @@ old_corpus <- Corpus(train = old_train, test = old_test) document_embeddings <- TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=TRUE) ``` -First, the `$make_label_dictionary` function is used to automatically create a label dictionary for the classification task. The label dictionary is a mapping from label to index, which is used to map the labels to a tensor of label indices. Besides classification tasks, flaiR also supports other label types for training custom model, such as `ner`, `pos` and `sentiment`. From the cc_muller dataset: Future (seen 423 times), Present (seen 262 times), Past (seen 131 times) +
+ +First, the `$make_label_dictionary` function is used to automatically create a label dictionary for the classification task. The label dictionary is a mapping from label to index, which is used to map the labels to a tensor of label indices. Besides classification tasks, flaiR also supports other label types for training custom model. From the cc_muller dataset: Future (seen 423 times), Present (seen 262 times), Past (seen 131 times). + +
+ ```{r} old_label_dict <- old_corpus$make_label_dictionary(label_type="classification") ``` +
+ `TextClassifier` is used to create a text classifier. The classifier takes the document embeddings (importing from `'distilbert-base-uncased'` from Hugging Face) and the label dictionary as input. The label type is also specified as classification. +
+ ```{r} old_classifier <- TextClassifier(document_embeddings, label_dictionary = old_label_dict, @@ -1153,7 +1699,6 @@ old_trainer$train("vignettes/inst/muller-campaign-communication", save_final_model=TRUE, max_epochs=1L) ``` -
## Continue Fine-tuning with New Dataset @@ -1203,5 +1748,529 @@ new_trainer$train("vignettes/inst/new-muller-campaign-communication", max_epochs=1L) ``` + +## Model Performance Metrics: Pre and Post Fine-tuning + +After fine-tuning for 1 epoch, the model showed improved performance on the same test set. + + +| Evaluation Metric | Pre-finetune | Post-finetune | Improvement | +|-------------------|--------------|---------------|-------------| +| F-score (micro) | 0.7294 | 0.8471 | +0.1177 | +| F-score (macro) | 0.7689 | 0.8583 | +0.0894 | +| Accuracy | 0.7294 | 0.8471 | +0.1177 | + More R tutorial and documentation see [here](https://github.com/davidycliao/flaiR). + + +------------------------------------------------------------------------ + +# Extending conText's Embedding Regression + +


`ConText` is a fast, flexible, and transparent framework for estimating context-specific word and short document embeddings using [the 'a la carte' embeddings regression](https://github.com/prodriguezsosa/EmbeddingRegression), implemented by Rodriguez et al.

In this case study, I'll demonstrate how to use the conText package alongside other embedding frameworks by working through the example provided in Rodriguez et al.'s [Quick Start Guide](https://github.com/prodriguezsosa/conText/blob/master/vignettes/quickstart.md). While ConText includes its own cross-lingual ALC Embeddings, this tutorial extends its capabilities by integrating it with flaiR. Through this integration, this tutorial shows how to:

- Access flaiR's powerful embedding models

- Connect with any transformer-based embedding models from HuggingFace via FlaiR

I'll be following the example directly from [Rodriguez et al.'s Quick Start Guide](https://github.com/prodriguezsosa/conText/blob/master/vignettes/quickstart.md) as this case study. It's important to note that results obtained using alternative embedding frameworks may deviate from the original implementation, and should be interpreted with caution. These comparative results are primarily intended for reference and educational use.

First of all, when loading the conText package, you'll find three pre-loaded datasets: `cr_sample_corpus`, `cr_glove_subset`, and `cr_transform`. These datasets are used in the package's tutorial to demonstrate preprocessing steps. For this exercise, I'll use `cr_sample_corpus` to explore other embedding frameworks, including:

- `en-crawl` embedding
- Flair NLP contextual embeddings (as described in [Akbik et al., COLING 2018 paper](https://flairnlp.github.io/docs/tutorial-embeddings/flair-embeddings))

- Integrated embeddings extracted from transformers like BERT.

+ +## Build Document-Embedding-Matrix with Other Embedding Frameworks + +```{r include=FALSE} +library(conText) +library(quanteda) +library(dplyr) +library(text2vec) +library(flaiR) +``` + +**Step 1** Tokenize Text with `quanteda` and `conText` + +
+ +First, let's tokenize `cr_sample_corpus` using the tokens_context function from the conText package. + +
+ +```{r} +# tokenize corpus removing unnecessary (i.e. semantically uninformative) elements +toks <- tokens(cr_sample_corpus, remove_punct=T, remove_symbols=T, remove_numbers=T, remove_separators=T) + +# clean out stopwords and words with 2 or fewer characters +toks_nostop <- tokens_select(toks, pattern = stopwords("en"), selection = "remove", min_nchar=3) + +# only use features that appear at least 5 times in the corpus +feats <- dfm(toks_nostop, tolower=T, verbose = FALSE) %>% dfm_trim(min_termfreq = 5) %>% featnames() + +# leave the pads so that non-adjacent words will not become adjacent +toks_nostop_feats <- tokens_select(toks_nostop, feats, padding = TRUE) + +# build a tokenized corpus of contexts surrounding the target term "immigration" +immig_toks <- tokens_context(x = toks_nostop_feats, pattern = "immigr*", window = 6L) + +# build document-feature matrix +immig_dfm <- dfm(immig_toks) +``` + +**Step 2** Import Embedding Tools + +
To facilitate the loading of different embedding types, I'll import the following __classes__ and __functions__ from flaiR: `WordEmbeddings`, `FlairEmbeddings`, `TransformerWordEmbeddings`, `StackedEmbeddings`, and `Sentence`. These components enable us to work with GloVe embeddings, Flair's contextual embeddings, and transformer-based embeddings from the HuggingFace library.


```{r include=FALSE}
# Load the flaiR library
library(flaiR)

# Import embedding classes from flair
WordEmbeddings <- flair_embeddings()$WordEmbeddings

# Import Sentence class for text processing
Sentence <- flair_data()$Sentence

# Initialize FastText embeddings trained on Common Crawl
fasttext_embeddings <- WordEmbeddings('en-crawl')
```

+Initialize a Flair Sentence object by concatenating cr_glove_subset row names. The collapse parameter ensures proper tokenization by adding space delimiters. Then, embed the sentence text using the loaded fasttext embeddings. +
+ +```{r} +# Combine all text into a single string and create a Flair sentence +sentence <- Sentence(paste(rownames(cr_glove_subset), collapse = " ")) + +# Apply FastText embeddings to the sentence +fasttext_embeddings$embed(sentence) +``` + +


The `process_embeddings` function from flaiR extracts the pre-computed FastText vectors from a sentence object and arranges them into a structured matrix. In this matrix, tokens are represented as rows, embedding dimensions as columns, and each row is labeled with its corresponding token text.
+ +```{r} +fasttext_subset <- process_embeddings(sentence, verbose = TRUE) +``` +**Step 3** Computing Context-Specific Word Embeddings Using FastText + +
+Create a feature co-occurrence matrix (FCM) from tokenized text and transform pre-trained FastText embeddings using co-occurrence information. + +
+ +```{r} +# Create a feature co-occurrence matrix (FCM) from tokenized text +toks_fcm <- fcm(toks_nostop_feats, + context = "window", + window = 6, + count = "frequency", + tri = FALSE) + +# Transform pre-trained Glove embeddings using co-occurrence information +ft_transform <- compute_transform( + x = toks_fcm, + pre_trained = fasttext_subset, + weighting = 'log' +) +``` + +Calculate Document Embedding Matrix (DEM) using transformed FastText embeddings + +```{r} +# Calculate Document Embedding Matrix (DEM) using transformed FastText embeddings +immig_dem_ft <- dem(x = immig_dfm, + pre_trained = fasttext_subset, + transform = TRUE, + transform_matrix = ft_transform, + verbose = TRUE) +``` + +Show each document inherits its corresponding docvars. +```{r} +head(immig_dem_ft@docvars) +``` + + +**Step 4** Embedding Eegression + +```{r} +set.seed(2021L) +library(conText) +ft_model <- conText(formula = immigration ~ party + gender, + data = toks_nostop_feats, + pre_trained = fasttext_subset, + transform = TRUE, + transform_matrix = ft_transform, + confidence_level = 0.95, + permute = TRUE, + jackknife = TRUE, + num_permutations = 100, + window = 6, case_insensitive = TRUE, + verbose = FALSE) +``` + +extract D-dimensional beta coefficients. 
+```{r} +# The intercept in this case is the fastext embedding for female Democrats +# beta coefficients can be combined to get each group's fastext embedding +DF_wv <- ft_model['(Intercept)',] # (D)emocrat - (F)emale +DM_wv <- ft_model['(Intercept)',] + ft_model['gender_M',] # (D)emocrat - (M)ale +RF_wv <- ft_model['(Intercept)',] + ft_model['party_R',] # (R)epublican - (F)emale +RM_wv <- ft_model['(Intercept)',] + ft_model['party_R',] + ft_model['gender_M',] # (R)epublican - (M)ale +``` + + +```{r} +# nearest neighbors +nns(rbind(DF_wv,DM_wv), + N = 10, + pre_trained = fasttext_subset, + candidates = ft_model@features) +``` + +```{r} +library(ggplot2) +ggplot(ft_model@normed_coefficients, aes(x = coefficient, y = normed.estimate)) + + geom_errorbar(aes(ymin = lower.ci, ymax = upper.ci), width = 0.2) + + geom_point(size = 3) + + geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") + + theme_minimal() + + labs( + title = "Estimated Coefficients with 95% CIs", + x = "Variables", + y = "Normalized Estimate" + ) + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + +``` + +## Exploring Document-Embedding Matrix with conText Functions + +Check dimensions of the resulting matrix. +```{r} +# Calculate average document embeddings for immigration-related texts +immig_wv_ft <- matrix(colMeans(immig_dem_ft), + ncol = ncol(immig_dem_ft)) %>% `rownames<-`("immigration") +dim(immig_wv_ft) +``` + +to get group-specific embeddings, average within party +```{r} +immig_wv_ft_party <- dem_group(immig_dem_ft, + groups = immig_dem_ft@docvars$party) +dim(immig_wv_ft_party) +``` + + +Find nearest neighbors by party +```{r} +# find nearest neighbors by party +# setting as_list = FALSE combines each group's results into a single tibble (useful for joint plotting) +immig_nns_ft <- nns(immig_wv_ft_party, + pre_trained = fasttext_subset, + N = 5, + candidates = immig_wv_ft_party@features, + as_list = TRUE) +``` + +check out results for Republican. 
+```{r} +immig_nns_ft[["R"]] +``` + +check out results for Democrat +```{r} +immig_nns_ft[["D"]] +``` + +compute the cosine similarity between each party's embedding and a specific set of features +```{r} +cos_sim(immig_wv_ft_party, + pre_trained = fasttext_subset, + features = c('reform', 'enforcement'), as_list = FALSE) +``` + +compute the cosine similarity between each party's embedding and a specific set of features. + +```{r} +# Republican +nns_ratio(x = immig_wv_ft_party, + N = 15, + numerator = "R", + candidates = immig_wv_ft_party@features, + pre_trained = fasttext_subset, + verbose = FALSE) +``` + +```{r} +# Democrat +nns_ratio(x = immig_wv_ft_party, + N = 15, + numerator = "D", + candidates = immig_wv_ft_party@features, + pre_trained = fasttext_subset, + verbose = FALSE) +``` + +compute the cosine similarity between each party's embedding and a set of tokenized contexts +```{r} +immig_ncs <- ncs(x = immig_wv_ft_party, + contexts_dem = immig_dem_ft, + contexts = immig_toks, + N = 5, + as_list = TRUE) + +# nearest contexts to Republican embedding of target term +# note, these may included contexts originating from Democrat speakers +immig_ncs[["R"]] +``` + +```{r} +immig_ncs[["D"]] +``` + +## Comparative Analysis of A La Carte, Flair Stacked, and BERT Embeddings + +### Build A La Carte Document-Embedding-Matrix + +```{r} +# build a document-embedding-matrix +immig_dem <- dem(x = immig_dfm, pre_trained = cr_glove_subset, transform = TRUE, transform_matrix = cr_transform, verbose = TRUE) + +set.seed(2021L) +alc_model <- conText(formula = immigration ~ party + gender, + data = toks_nostop_feats, + pre_trained = cr_glove_subset, + transform = TRUE, + transform_matrix = cr_transform, + jackknife = TRUE, + confidence_level = 0.95, + permute = TRUE, + num_permutations = 100, + window = 6, + case_insensitive = TRUE, + verbose = FALSE) +``` + +#### Document Embedding Matrix Construction Using Flair Contextual Stacked Embeddings + +


To facilitate the loading of different embedding types, I'll import the following __classes__ and __functions__ from flaiR: `WordEmbeddings`, `FlairEmbeddings`, `TransformerWordEmbeddings`, `StackedEmbeddings`, and `Sentence`. These components enable us to work with GloVe embeddings, Flair's contextual embeddings, and transformer-based embeddings from the HuggingFace library.

+ +```{r include=FALSE} +# Load the flaiR library +library(flaiR) + +# Import embedding classes from flair +WordEmbeddings <- flair_embeddings()$WordEmbeddings +FlairEmbeddings <- flair_embeddings()$FlairEmbeddings +TransformerWordEmbeddings <- flair_embeddings()$TransformerWordEmbeddings +StackedEmbeddings <- flair_embeddings()$StackedEmbeddings + +# Import Sentence class for text processing +Sentence <- flair_data()$Sentence + +# Initialize FastText embeddings trained on Common Crawl +fasttext_embeddings <- WordEmbeddings('en-crawl') + +# Initialize Flair's forward language model embeddings trained on news data +flair_forward <- FlairEmbeddings('news-forward') + +# Initialize Flair's backward language model embeddings trained on news data +flair_backward <- FlairEmbeddings('news-backward') + +``` + +
+ +Combine three different types of embeddings into a stacked embedding model. + +This creates a stacked embedding model that combines: + +- `FastText embeddings`: Captures general word semantics +- `Forward Flair`: Captures contextual information reading text left-to-right +- `Backward Flair`: Captures contextual information reading text right-to-left + +


```{r}
stacked_embeddings <- StackedEmbeddings(list(
  fasttext_embeddings,
  flair_forward,
  flair_backward
))
```

```{r}
# Step 1: Create a Flair Sentence object from the text
sentence <- Sentence(paste(rownames(cr_glove_subset), collapse = " "))

# Step 2: Generate embeddings using our stacked model
stacked_embeddings$embed(sentence)

# Step 3: Extract and store embeddings for each token
stacked_subset <- process_embeddings(sentence, verbose = TRUE)

# Step 4: Compute transformation matrix

st_transform <- compute_transform(
  x = toks_fcm,
  pre_trained = stacked_subset,
  weighting = 'log'
)

# Step 5: Generate document embeddings matrix
immig_dem_st <- dem(
  x = immig_dfm,
  pre_trained = stacked_subset,
  transform = TRUE,
  transform_matrix = st_transform,
  verbose = TRUE
)

# Step 6: Fit conText model for analysis
set.seed(2021L)
st_model <- conText(formula = immigration ~ party + gender,
                    data = toks_nostop_feats,
                    pre_trained = stacked_subset,
                    transform = TRUE,
                    transform_matrix = st_transform,
                    jackknife = TRUE,
                    confidence_level = 0.95,
                    permute = TRUE,
                    num_permutations = 100,
                    window = 6,
                    case_insensitive = TRUE,
                    verbose = FALSE)
```

#### Document Embedding Matrix Construction with BERT

+ +BERT embeddings provide powerful contextual representations through their bidirectional transformer architecture. These embeddings are good at understanding context from both directions within text, generating deep contextual representations through multiple transformer layers, and leveraging pre-training on large text corpora to achieve strong performance across NLP tasks. + +The classic BERT base model generates 768-dimensional embeddings for each token, providing rich semantic representations. + +By utilizing the Flair framework, we also can seamlessly integrate: + +- Multiple BERT variants like RoBERTa and DistilBERT +- Cross-lingual models such as XLM-RoBERTa +- Domain-adapted BERT models +- Any transformer model available on HuggingFace + +
+ + +```{r} +# Initialize BERT base uncased model embeddings from HuggingFace +bert_embeddings <- TransformerWordEmbeddings('bert-base-uncased') +``` + + +```{r} +# Step 1: Create a Flair Sentence object from the text +sentence <- Sentence(paste(rownames(cr_glove_subset), collapse = " ")) + +# Step 2: Generate embeddings using BERT model from HugginFace +bert_embeddings$embed(sentence) + +# Step 3: Extract and store embeddings for each token +bert_subset <- process_embeddings(sentence, verbose = TRUE) + +# Step 4: Compute transformation matrix +bt_transform <- compute_transform(x = toks_fcm, + pre_trained = bert_subset, + weighting = 'log') + +# Step 5: Generate document embeddings matrix +immig_dem_bt <- dem(x = immig_dfm, + pre_trained = bert_subset, + transform = TRUE, + transform_matrix = bt_transform, + verbose = TRUE) + +# Step 6: Fit conText model for analysis +set.seed(2021L) +bt_model <- conText(formula = immigration ~ party + gender, + data = toks_nostop_feats, + pre_trained = bert_subset, + transform = TRUE, + transform_matrix = bt_transform, + jackknife = TRUE, + confidence_level = 0.95, + permute = TRUE, + num_permutations = 100, + window = 6, + case_insensitive = TRUE, + verbose = FALSE) +``` + +#### Comparision of Different Embedding Approaches + +
+ +While this tutorial doesn't determine a definitive "best" approach, it's important to understand the key distinctions between word embedding methods. BERT, FastText, Flair Stacked Embeddings, and GloVe can be categorized into two groups: dynamic and static embeddings. + +Dynamic embeddings, such as BERT and Flair, adapt their word representations based on context using high-dimensional vector spaces (BERT uses 768 dimensions in its base model). These methods excel at handling contextual nuances: BERT employs self-attention mechanisms and subword tokenization, while Flair uses character-level modeling. Both effectively handle out-of-vocabulary words through these mechanisms. + +Static embeddings, like FastText (300 dimensions) and GloVe (typically 100-300 dimensions), maintain consistent word representations regardless of context. FastText stands out among static embeddings by incorporating subword information, giving it an advantage in handling out-of-vocabulary words, while GloVe relies on pre-computed global word-word co-occurrence statistics. + +Each embedding method has its unique characteristics and trade-offs. While they share the common goal of capturing meaningful textual relationships, they differ in their balance of contextual awareness, computational demands, and representation stability. The choice among them depends on specific project requirements, including accuracy needs, computational resources, and the nature of the text being analyzed. + +
+ + +```{r echo=FALSE, message = TRUE, warning = TRUE, out.width="95%"} +st <- as.data.frame(st_model@normed_coefficients) +st["model"] <- "Flair Stacked Embeddings" +ft <- as.data.frame(ft_model@normed_coefficients) +ft["model"] <- "FastTest" +bt <- as.data.frame(bt_model@normed_coefficients) +bt["model"] <- "BERT" +ac <- as.data.frame(alc_model@normed_coefficients) +ac["model"] <- "Glove" + +merged_df <- rbind(st,ft,bt,ac) + +library(ggplot2) +# Create a faceted plot comparing different embedding results in a 2x2 grid +ggplot(merged_df, aes(x = coefficient, y = normed.estimate)) + + geom_errorbar(aes(ymin = lower.ci, ymax = upper.ci), width = 0.2) + + geom_point(size = 3) + + # Add reference line at y=0 + geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") + + theme_minimal() + + labs( + title = "Estimated Coefficients with 95% CIs", + x = "", + y = "Normalized Estimate" + ) + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + facet_wrap(~model, nrow = 2, ncol = 2, scales = "free_y") +``` + + + From cd2c2904ce53fba81a4a978db30c2f7fc08d47f3 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 05:13:46 +0000 Subject: [PATCH 15/46] UPDATE 0.7.0 --- README.Rmd | 4 +- README.md | 6 +- tests/testthat/test_get_entities.R | 99 ------ tests/testthat/test_get_entities_batch.R | 51 --- tests/testthat/test_get_pos.R | 23 +- tests/testthat/test_get_pos_batch.R | 43 --- tests/testthat/test_get_sentiments.R | 91 ----- tests/testthat/test_get_sentiments_batch.R | 37 -- tests/testthat/test_util.R | 16 +- vignettes/tutorial.Rmd | 376 ++------------------- 10 files changed, 46 insertions(+), 700 deletions(-) delete mode 100644 tests/testthat/test_get_entities_batch.R delete mode 100644 tests/testthat/test_get_pos_batch.R delete mode 100644 tests/testthat/test_get_sentiments.R delete mode 100644 tests/testthat/test_get_sentiments_batch.R diff --git a/README.Rmd b/README.Rmd index 169c1eb4..2412ab50 100644 --- a/README.Rmd +++ b/README.Rmd @@ 
-35,7 +35,7 @@ system(paste(reticulate::py_config()$python, "-m pip install flair"))
-__flaiR__ is an R package for accessing the [flairNLP/flair](flairNLP/flair) Python library, maintained by [Yen-ChiehLiao](https://davidycliao.github.io) ([University of Birmingham](https://www.birmingham.ac.uk/research/centres-institutes/centre-for-artificial-intelligence-in-government)) and [Stefan Müller](https://muellerstefan.net) from [Next Generation Energy Systems](https://www.nexsys-energy.ie) and [Text and Policy Research Group](https://text-and-policy.com) in UCD. flaiR provides convenient access to the main functionalities of flairNLP for training word embedding-based deep learning models and fine-tune state-of-the-art transformers hosted on Hugging Face. Our team trains and fine-tunes the models with Flair in [our projects](). +__flaiR__ is an R package for accessing the [flairNLP/flair](flairNLP/flair) Python library, maintained by [Yen-Chieh Liao](https://davidycliao.github.io) ([University of Birmingham](https://www.birmingham.ac.uk/research/centres-institutes/centre-for-artificial-intelligence-in-government)) and [Stefan Müller](https://muellerstefan.net) from [Next Generation Energy Systems](https://www.nexsys-energy.ie) and [Text and Policy Research Group](https://text-and-policy.com) in UCD. flaiR provides convenient access to the main functionalities of flairNLP for training word embedding-based deep learning models and fine-tune state-of-the-art transformers hosted on Hugging Face. Our team trains and fine-tunes the models with Flair in [our projects]().
@@ -55,7 +55,6 @@ library(flaiR) ``` ``` r -library(flaiR) #> flaiR: An R Wrapper for Accessing Flair NLP 0.13.1 ``` @@ -88,6 +87,7 @@ __flaiR__ runs the Flair NLP backend in Python, thus requiring Python installati - [Tutorial for embeddings in flaiR](https://davidycliao.github.io/flaiR/articles/tutorial.html#embedding) +- [Extending conText’s Embedding Regression](https://davidycliao.github.io/flaiR/articles/tutorial.html#extending-contexts-embedding-regression)
diff --git a/README.md b/README.md index 00da0179..80ff200c 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ **flaiR** is an R package for accessing the [flairNLP/flair](flairNLP/flair) Python library, maintained by -[Yen-ChiehLiao](https://davidycliao.github.io) ([University of +[Yen-Chieh Liao](https://davidycliao.github.io) ([University of Birmingham](https://www.birmingham.ac.uk/research/centres-institutes/centre-for-artificial-intelligence-in-government)) and [Stefan Müller](https://muellerstefan.net) from [Next Generation Energy Systems](https://www.nexsys-energy.ie) and [Text and Policy @@ -41,7 +41,6 @@ library(flaiR) ``` ``` r -library(flaiR) #> flaiR: An R Wrapper for Accessing Flair NLP 0.13.1 ``` @@ -81,6 +80,9 @@ these specific versions. - [Tutorial for embeddings in flaiR](https://davidycliao.github.io/flaiR/articles/tutorial.html#embedding) +- [Extending conText’s Embedding + Regression](https://davidycliao.github.io/flaiR/articles/tutorial.html#extending-contexts-embedding-regression) +
## Contribution and Open Source diff --git a/tests/testthat/test_get_entities.R b/tests/testthat/test_get_entities.R index 249d252d..8b137891 100644 --- a/tests/testthat/test_get_entities.R +++ b/tests/testthat/test_get_entities.R @@ -1,100 +1 @@ -# Test 1: get_entities returns four entities for two input texts using "ner" -test_that("get_entities returns four entities for two input texts using 'ner'", { - result <- get_entities( - texts = c("UCD is one of the best university in Ireland. ", - "TCD in less better than Oxford"), - doc_ids = c("doc1", "doc2"), - language = "ner" - ) - # Check that the number of rows in the result matches the expected number of entities - expect_equal(nrow(result), 4) -}) -# Test 2: get_entities_batch returns four entities for two input texts. -test_that("get_entities_batch returns four entities for two input texts using 'ner'", { - result <- get_entities_batch( - texts = c("UCD is one of the best university in Ireland. ", - "TCD in less better than Oxford"), - doc_ids = c("doc1", "doc2"), - language = "ner", - batch_size = 5, - device = "cpu" - ) - # Check that the number of rows in the result matches the expected number of entities - expect_equal(nrow(result), 4) -}) - - -# Test 3: get_entities throws an error for mismatched lengths of texts and doc_ids -test_that("get_entities throws an error for mismatched lengths of texts and doc_ids", { - expect_error( - get_entities( - texts = "TCD in less better than Oxford", - doc_ids = c("doc1", "doc2"), - language = "ner" - ), - "The lengths of texts and doc_ids do not match." 
- ) -}) - -# Test 4: get_entities returns NA for the "tag" field when there are mismatched -# lengths of texts and doc_ids -test_that("get_entities returns NA for the 'tag' field.", { - result <- get_entities( - texts = "TCD in less better than Oxford", - doc_ids = "doc1", - language = "ner" - ) - # Check that the "tag" field is NA - expect_true(is.na(result[3, "tag"]$tag)) -}) - -# Test 5: get_entities returns NA for the "tag" field when the input text -# does not contain entities -test_that("get_entities returns NA for the 'tag' field", { - result <- get_entities( - texts = "xxxxxxxxx", - doc_ids = "doc1", - language = "ner" - ) - # Check that the "tag" field is NA - expect_true(is.na(result[1, "tag"]$tag)) -}) - -# Test 6: get_entities returns NA for the "tag" field when the input text is NA -test_that("get_entities returns NA for the 'tag' field.", { - result <- get_entities( - texts = NA, - doc_ids = NA, - language = "ner" - ) - # Check that the "tag" field is NA - expect_true(is.na(result[1, "tag"]$tag)) -}) - -# Test 7: get_entities returns NA for the "tag" field when the input text is -# NA and show.text_id is TRUE -test_that("get_entities returns NA for the 'tag' field", { - result <- get_entities( - texts = NA, - doc_ids = NA, - show.text_id = TRUE, - gc.active = TRUE, - language = "ner" - ) - # Check that the "tag" field is NA - expect_true(is.na(result[1, "tag"]$tag)) -}) - -# Test 8: get_entities returns the correct entity tag "ORG" for an input text -test_that("get_entities returns the correct entity tag 'ORG' for an input text", { - result <- get_entities( - texts = "TCD in less better than Oxford", - doc_ids = "doc1", - show.text_id = TRUE, - gc.active = TRUE, - language = "ner" - ) - # Check that the entity tag is "ORG" - expect_equal(result[1, "tag"]$tag, "ORG") -}) diff --git a/tests/testthat/test_get_entities_batch.R b/tests/testthat/test_get_entities_batch.R deleted file mode 100644 index 001eda32..00000000 --- 
a/tests/testthat/test_get_entities_batch.R +++ /dev/null @@ -1,51 +0,0 @@ - -test_that("batch_texts and batch_doc_ids lengths mismatch", { - expect_error(get_entities_batch(texts = c("text1", "text2"), - doc_ids = "id1", - show.text_id = FALSE), - "The lengths of texts and doc_ids do not match.") -}) - -test_that("NA values for text or doc_id", { - result <- get_entities_batch(texts = c("text1", NA), - doc_ids = c("id1", "id2"), - show.text_id = FALSE) - expect_equal(typeof(result$doc_id[2]), "character") - expect_equal(result$entity[2], NA) - expect_equal(result$tag[2], NA) -}) - -test_that("No entities detected", { - # Assuming that the tagger returns no entities for "text_without_entity" - result <- get_entities_batch(texts = "text_without_entity", - doc_ids = "id1", - show.text_id = FALSE) - expect_equal(result$doc_id[1], "id1") - expect_equal(result$entity[1], NA) - expect_equal(result$tag[1], NA) -}) - -test_that("Inclusion of doc_id when show.text_id is TRUE", { - result <- get_entities_batch(texts = "text1", - doc_ids = "id1", - show.text_id = TRUE) - expect_true("text_id" %in% colnames(result)) - expect_equal(result$text_id[1], "text1") -}) - - -test_that("Mismatched lengths of batch_texts and batch_doc_ids raise an error", { - expect_error(get_entities_batch(c("Hello", "World"), "doc1", show.text_id = TRUE), - "The lengths of texts and doc_ids do not match.") -}) - -test_that("show.text_id = TRUE adds a text_id column", { - result <- get_entities_batch(c("Hello", "World"), c("doc1", "doc2"), show.text_id = TRUE) - expect_true("text_id" %in% names(result)) -}) - -test_that("show.text_id = FALSE does not add a text_id column", { - result <- get_entities_batch(c("Hello", "World"), c("doc1", "doc2"), show.text_id = TRUE) - expect_true("text_id" %in% colnames(result)) -}) - diff --git a/tests/testthat/test_get_pos.R b/tests/testthat/test_get_pos.R index 222158c3..8ba967a7 100644 --- a/tests/testthat/test_get_pos.R +++ b/tests/testthat/test_get_pos.R @@ -92,18 
+92,19 @@ test_that("get_pos_batch throws an error for no valid texts provided.", { }) # Test 8: test loading tagger works as expected - test_that("loading tagger works as expected", { - # Assuming you have a valid tagger object for English - valid_tagger <- load_tagger_pos("pos-fast") + expected_messages <- c( + "Model name not specified. Using default 'pos-fast' model.", + "Loading POS tagger model: pos-fast" + ) - # tagger is NULL and no language is specified - expect_message(get_pos("Hello World", "doc1"), "Language is not specified. pos-fastin Flair is forceloaded. Please ensure that the internet connectivity is stable.") + messages <- capture_messages(get_pos("Hello World", "doc1")) - # tagger is NULL but a language is specified - expect_silent(get_pos("Hello World", "doc1", language = "pos")) - # - # a valid tagger object is passed - expect_silent(get_pos("Hello World", "doc1", tagger = valid_tagger)) + # 檢查每個期望的訊息是否都存在 + for(expected in expected_messages) { + expect_true( + any(grepl(expected, messages, fixed = TRUE)), + info = paste("Expected message not found:", expected) + ) + } }) - diff --git a/tests/testthat/test_get_pos_batch.R b/tests/testthat/test_get_pos_batch.R deleted file mode 100644 index 34d78eec..00000000 --- a/tests/testthat/test_get_pos_batch.R +++ /dev/null @@ -1,43 +0,0 @@ -# Test 1: get_pos_batch returns pos tags for three input texts using a custom tagger -test_that("get_pos_batch returns pos tags for three input texts using a custom tagger", { - result <- get_pos_batch( - texts = c("UCD is one of the best universities in Ireland.", - "Essex is famous in social science research", - "TCD is the oldest one in Ireland."), - doc_ids = c("doc1", "doc2", "doc3"), - tagger = load_tagger_pos(), - language = "upos-fast", - batch_size = 1 - ) - # Check that the number of rows in the result matches the number of tokens - expect_equal(nrow(result), 25) -}) - -# Test 2: get_pos_batch returns pos tags for three input texts using a custom tagger 
-test_that("texts and doc_ids are 0 and get_pos_batch returns NA", { - result <- get_pos_batch( - texts = "", - doc_ids = "", - tagger = load_tagger_pos(), - language = "upos-fast" , - batch_size = 1) - # Check that the number of rows in the result matches the number of tokens - expect_equal(nrow(result), 1) -}) - - -# Test 2: test loading tagger in get_pos_batch works as expected - -test_that("loading tagger works as expected", { - # Assuming you have a valid tagger object for English - valid_tagger <- load_tagger_pos("pos-fast") - - # Test 1: tagger is NULL and no language is specified - expect_message(get_pos_batch("Hello Dublin", batch_size = 1, doc_ids = "doc1"), "Language is not specified. pos-fastin Flair is forceloaded. Please ensure that the internet connectivity is stable.", fixed = FALSE) - - # Test 2: tagger is NULL but a language is specified - expect_equal(get_pos_batch(texts = "Hello Ireland", batch_size = 1, doc_ids = "doc1", language = "pos-fast")[1, "tag"]$tag, "UH") - # - # Test 3: a valid tagger object is passed - expect_message(get_pos_batch("Hello Colchester", batch_size = 1, doc_ids = "doc1", tagger = valid_tagger), "CPU is used.") -}) diff --git a/tests/testthat/test_get_sentiments.R b/tests/testthat/test_get_sentiments.R deleted file mode 100644 index 6a11cb77..00000000 --- a/tests/testthat/test_get_sentiments.R +++ /dev/null @@ -1,91 +0,0 @@ -# Test 1: get_sentiments returns sentiment scores for two input texts -test_that("get_sentiments returns sentiment scores for two input texts", { - result <- get_sentiments( - texts = c("UCD is one of the best universities in Ireland.", - "TCD is less better than Oxford."), - doc_ids = c("doc1", "doc2"), - language = "sentiment" - ) - # Check that the number of rows in the result matches the number of texts - expect_equal(nrow(result), 2) -}) - - -# Test 2: get_sentiments returns sentiment scores for two input texts -test_that("get_sentiments returns sentiment scores for two input texts", { - result 
<- get_sentiments_batch( - texts = c("UCD is one of the best universities in Ireland.", - "TCD is less better than Oxford."), - doc_ids = c("doc1", "doc2"), - language = "sentiment" - ) - # Check that the number of rows in the result matches the number of texts - expect_equal(nrow(result), 2) -}) - -# Test 3: get_sentiments returns sentiment scores using a custom tagger -test_that("get_sentiments returns sentiment scores using a custom tagger", { - result <- get_sentiments( - texts = c("UCD is one of the best universities in Ireland.", - "TCD is less better than Oxford."), - doc_ids = c("doc1", "doc2"), - tagger = load_tagger_sentiments(), - language = "sentiment" - ) - # Check that the number of rows in the result matches the number of texts - expect_equal(nrow(result), 2) -}) - -# Test 4: get_sentiments throws an error for mismatched lengths of texts and doc_ids -test_that("get_sentiments throws an error for mismatched lengths of texts and doc_ids", { - expect_error( - get_sentiments( - texts = "TCD in less better than Oxford", - doc_ids = c("doc1", "doc2"), - language = "sentiment" - ), - "The lengths of texts and doc_ids do not match." - ) -}) - -# Test 5: get_sentiments handles NA values and returns NA for sentiment scores -test_that("get_sentiments handles NA values and returns NA for sentiment scores", { - result <- get_sentiments( - texts = NA, - doc_ids = NA, - show.text_id = TRUE, - gc.active = TRUE, - language = "sentiment" - ) - # Check that the sentiment score is NA - expect_true(is.na(result[1, "score"]$score)) -}) - - -# Test 6: get_sentiments returns "The lengths of texts and doc_ids do not match. 
-test_that("get_sentiments returns The lengths of texts and doc_ids do not match.", { - expect_error(get_sentiments( - texts = "TCD in less better than Oxford", "Essex is in Colchester", - doc_ids = c("doc1", "doc2"), - show.text_id = TRUE, - gc.active = TRUE, - language = "sentiment" - ) - , "The lengths of texts and doc_ids do not match.") -}) - -# Test 7: get_sentiments with empty input returns NA for score -test_that("get_sentiments with empty input returns NA for score", { - # Call get_sentiments with empty input - result <- get_sentiments( - texts = "", - doc_ids = "", - show.text_id = TRUE, - gc.active = TRUE, - language = "sentiment" - ) - # Check that the result has one row and the "score" column is NA - expect_equal(nrow(result), 1) - expect_true(is.na(result$score)) -}) - diff --git a/tests/testthat/test_get_sentiments_batch.R b/tests/testthat/test_get_sentiments_batch.R deleted file mode 100644 index cef8fa5c..00000000 --- a/tests/testthat/test_get_sentiments_batch.R +++ /dev/null @@ -1,37 +0,0 @@ -# Test 1: Ensure the length of texts and doc_ids are the same. - -test_that("Ensure the length of texts and doc_ids are the same", { - expect_error(get_sentiments_batch(texts = c("text1", "text2"), - doc_ids = "id1"), - "The lengths of texts and doc_ids do not match.") -}) - -# Test 2: Check if texts is empty after removing NAs and empty texts -test_that("Check if texts is empty after removing NAs and empty texts", { - expect_message(get_sentiments_batch(texts = c("", NA, NA), - doc_ids = c("id1", "id2", "id3")), - "CPU is used.", fixed = FALSE) -}) - -# Test 3: Check sentiment and score for specific input text - -test_that("Check sentiment and score for specific input text", { - result <- get_sentiments_batch(texts = "some_text_without_sentiment", - doc_ids = "id1") - - expect_equal(result$sentiment[1], "NEGATIVE") - expect_equal(result$score[1], 0.9968621, tolerance = 0.0001) -}) - -# Test 4. 
text_id` is added to the result only if `show.text_id` is TRUE -test_that("`text_id` is added to the result only if `show.text_id` is TRUE", { - result_with_text_id <- get_sentiments_batch(texts = "some_text", - doc_ids = "id1", - show.text_id = TRUE) - expect_true("text_id" %in% names(result_with_text_id)) - - result_without_text_id <- get_sentiments_batch(texts = "some_text", - doc_ids = "id1", - show.text_id = FALSE) - expect_false("text_id" %in% names(result_without_text_id)) -}) diff --git a/tests/testthat/test_util.R b/tests/testthat/test_util.R index 53855e95..9a8f2b56 100644 --- a/tests/testthat/test_util.R +++ b/tests/testthat/test_util.R @@ -1,13 +1,13 @@ # Test 1: Ensure that `get_entities` provides the expected row count when # provided with specific texts, doc_ids, and a pre-loaded tagger. -test_that("get_entities throws an error for unsupported languages under no internet condition", { - expect_equal(nrow(get_entities( - texts = c("UCD is one of the best university in Ireland. ", - "TCD in less better than Oxford"), - doc_ids= c("doc1", "doc2"), load_tagger_ner("en"))), - 4) -}) - +# test_that("get_entities throws an error for unsupported languages under no internet condition", { +# expect_equal(nrow(get_entities( +# texts = c("UCD is one of the best university in Ireland. ", +# "TCD in less better than Oxford"), +# doc_ids= c("doc1", "doc2"), load_tagger_ner("en"))), +# 4) +# }) +# # Test 2: Similar test as above, but without explicitly specifying the language # for the tagger. This tests the default behavior. diff --git a/vignettes/tutorial.Rmd b/vignettes/tutorial.Rmd index 2ead3dfa..5a8fdefd 100644 --- a/vignettes/tutorial.Rmd +++ b/vignettes/tutorial.Rmd @@ -37,6 +37,8 @@ reticulate::py_install("flair") In addition, Flair NLP offers pre-trained models for various languages and tasks, and is compatible with fine-tuned transformers hosted on Hugging Face. 
+ + - [**Sentence and Token Object**](#tutorial.html#sentence-and-token) - [**Sequence Taggings**](#tutorial.html#sequence-taggings) @@ -53,10 +55,6 @@ In addition, Flair NLP offers pre-trained models for various languages and tasks - [**Extending conText's Embedding Regression**](#tutorial.html#extending-contexts-embedding-regression) - - - - ------------------------------------------------------------------------ # Sentence and Token @@ -86,7 +84,6 @@ sentence <- Sentence(string) print(sentence) ``` - ## **Token** @@ -114,8 +111,8 @@ print(sentence$tokens) **Retrieve the Token** -
+
To comprehend the string representation format of the Sentence object, tagging at least one token is adequate. Python's `get_token(n)` method allows us to retrieve the Token object for a particular token. Additionally, we can use **`[]`** to index a specific token.
@@ -129,17 +126,17 @@ sentence$get_token(5) # indexing in R sentence[6] ``` -
+
Each word (and punctuation) in the text is treated as an individual Token object. These Token objects store text information and other possible linguistic information (such as part-of-speech tags or named entity tags) and embedding (if you used a model to generate them). While you do not need to create Token objects manually, understanding how to manage them is useful in situations where you might want to fine-tune the tokenization process. For example, you can control the exactness of tokenization by manually creating Token objects from a Sentence object. This makes Flair very flexible when handling text data since the automatic tokenization feature can be used for rapid development, while also allowing users to fine-tune their tokenization. -
**Annotate POS tag and NER tag** +
The `add_label(label_type, value)` method can be employed to assign a label to the token. In Universal POS tags, if `sentence[10]` is 'see', 'seen' might be tagged as `VERB`, indicating it is a past participle form of a verb.
@@ -165,9 +162,7 @@ print(sentence[4]) ```
- If we print the sentence object, `Sentence[50]` provides information for 50 tokens → ['in'/ORG, 'seen'/VERB], thus displaying two tagging pieces of information. -
```{r} @@ -180,9 +175,7 @@ print(sentence) ## **Corpus**
- The Corpus object in Flair is a fundamental data structure that represents a dataset containing text samples, usually comprising of a training set, a development set (or validation set), and a test set. It's designed to work smoothly with Flair's models for tasks like named entity recognition, text classification, and more. -
**Attributes:** @@ -192,7 +185,6 @@ The Corpus object in Flair is a fundamental data structure that represents a dat - `train`: A list of sentences (List[Sentence]) that form the training dataset. - `dev` (or development): A list of sentences (List[Sentence]) that form the development (or validation) dataset. - `test`: A list of sentences (List[Sentence]) that form the test dataset. -
**Important Methods:** @@ -331,7 +323,6 @@ print(sentence) ```
- Use a for loop to print out each POS tag. It's important to note that Python is indexed from 0. Therefore, in an R environment, we must use `seq_along(sentence$get_labels()) - 1`.
@@ -342,12 +333,10 @@ for (i in seq_along(sentence$get_labels())) { } ``` -
## Tag Part-of-Speech
- We use flaiR/POS-english for POS tagging in the standard models on Hugging Face.
@@ -421,11 +410,9 @@ for (i in seq_along(sentence$get_labels())) { ## Detect Sentiment
- Let's run sentiment analysis over the same sentence to determine whether it is POSITIVE or NEGATIVE. You can do this with essentially the same code as above. Instead of loading the 'ner' model, you now load the 'sentiment' model: -
```{r} @@ -452,17 +439,13 @@ print(sentence) ------------------------------------------------------------------------ - - ## Dealing with Dataframe ### Parts-of-Speech Tagging Across Full DataFrame
- You can apply Part-of-Speech (POS) tagging across an entire DataFrame using Flair's pre-trained models. Let's walk through an example using the pos-fast model. You can apply Part-of-Speech (POS) tagging across an entire DataFrame using Flair's pre-trained models. Let's walk through an example using the pos-fast model. First, let's load our required packages and sample data: -
```{r} @@ -472,24 +455,24 @@ uk_immigration <- uk_immigration[1:2,] ```
- For POS tagging, we'll use Flair's pre-trained model. The pos-fast model offers a good balance between speed and accuracy. For more pre-trained models, check out Flair's documentation at Flair POS Tagging Documentation. There are two ways to load the POS tagger: +
+ - Load with tag dictionary display (default): - ```{r} tagger_pos <- load_tagger_pos("pos-fast") ``` + This will show you all available POS tags grouped by categories (nouns, verbs, adjectives, etc.). - Load without tag display for a cleaner output: + ```{r} -# 載入但不顯示標籤 -pos_tagger <- load_tagger_pos("pos-fast", - show_tags = FALSE) +pos_tagger <- load_tagger_pos("pos-fast", show_tags = FALSE) ``` Now we can process our texts: @@ -503,8 +486,6 @@ results <- get_pos(texts = uk_immigration$text, head(results, n = 10) ``` - - ### Tagging Entities Across Full DataFrame
@@ -514,9 +495,7 @@ This section focuses on performing Named Entity Recognition (NER) on data stored I load the flaiR package and use the built-in uk_immigration dataset. For demonstration purposes, I'm only taking the first two rows. This dataset contains discussions about immigration in the UK. - Load the pre-trained model `ner`. For more pre-trained models, see . -
@@ -525,10 +504,9 @@ library(flaiR) data(uk_immigration) uk_immigration <- head(uk_immigration, n = 2) ``` -
+
Next, I load the latest model hosted and maintained on Hugging Face by the Flair NLP team. For more Flair NER models, you can visit the official Flair NLP page on Hugging Face (https://huggingface.co/flair). -
```{r} @@ -539,38 +517,12 @@ library(flaiR) tagger_ner <- load_tagger_ner("flair/ner-english-ontonotes") ``` - - - - - - - - - - - - - - - - - - - - - - - - -
- I load a pre-trained NER model. Since I'm using a Mac M1/M2, I set the model to run on the MPS device for faster processing. If I want to use other pre-trained models, I can check the Flair documentation website for available options. -
Now I'm ready to process the text: + ```{r} results <- get_entities(texts = uk_immigration$text, doc_ids = uk_immigration$speaker, @@ -581,143 +533,23 @@ head(results, n = 10) ``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ------------------------------------------------------------------------ # Embedding
- Flair is a very popular natural language processing library, providing a variety of embedding methods for text representation. Flair Embeddings is a word embedding framework developed by [Zalando](https://engineering.zalando.com/posts/2018/11/zalando-research-releases-flair.html). It focuses on word-level representation and can capture contextual information of words, allowing the same word to have different embeddings in different contexts. Unlike traditional word embeddings (such as Word2Vec or GloVe), Flair can dynamically generate word embeddings based on context and has achieved excellent results in various NLP tasks. Below are some key points about Flair Embeddings:
**Context-Aware** -
+
Flair is a dynamic word embedding technique that can understand the meaning of words based on context. In contrast, static word embeddings, such as Word2Vec or GloVe, provide a fixed embedding for each word without considering its context in a sentence. Therefore, context-sensitive embedding techniques, such as Flair, can capture the meaning of words in specific sentences more accurately, thus enhancing the performance of language models in various tasks. -Example: +__Example:__ Consider the following two English sentences: @@ -753,11 +585,9 @@ bank_embedding_sentence2 = sentence2[6]$embedding # "bank" is the sixth word ```
- Same word, similar vector representation, but essentially different. In this way, you can see how the dynamic embeddings for "bank" in the two sentences differ based on context. Although we printed the embeddings here, in reality, they would be high-dimensional vectors, so you might see a lot of numbers. If you want a more intuitive view of the differences, you could compute the cosine similarity or other metrics between the two embeddings. This is just a simple demonstration. In practice, you can also combine multiple embedding techniques, such as `WordEmbeddings` and `FlairEmbeddings`, to get richer word vectors. -
```{r} @@ -769,22 +599,19 @@ cosine(as.numeric( bank_embedding_sentence1$numpy()), **Character-Based**
- Flair uses a character-level language model, meaning it can generate embeddings for rare words or even misspelled words. This is an important feature because it allows the model to understand and process words that have never appeared in the training data. Flair uses a bidirectional LSTM (Long Short-Term Memory) network that operates at a character level. This allows it to feed individual characters into the LSTM instead of words.
**Multilingual Support** -
+
Flair provides various pre-trained character-level language models, supporting contextual word embeddings for multiple languages. It allows you to easily combine different word embeddings (e.g., Flair Embeddings, Word2Vec, GloVe, etc.) to create powerful stacked embeddings. -
## Classic Wordembeddings
- In Flair, the simplest form of embeddings that still contains semantic information about the word are called classic word embeddings. These embeddings are pre-trained and non-contextual. Let's retrieve a few word embeddings and use FastText embeddings with the following code. To do so, we simply instantiate a WordEmbeddings class by passing in the ID of the embedding of our choice. Then, we simply wrap our text into a Sentence object, and call the `embed(sentence)` method on our WordEmbeddings class. @@ -805,9 +632,7 @@ for (i in seq_along(sentence$tokens)) { ```
- Flair supports a range of classic word embeddings, each offering unique features and application scopes. Below is an overview, detailing the ID required to load each embedding and its corresponding language. -
| Embedding Type | ID | Language | @@ -855,11 +680,9 @@ Flair supports a range of classic word embeddings, each offering unique features ## Contexual Embeddings
- The idea behind contextual string embeddings is that each word embedding should be defined by not only its syntactic-semantic meaning but also the context it appears in. What this means is that each word will have a different embedding for every context it appears in. Each pre-trained Flair model offers a **forward** version and a **backward** version. Let's assume you are processing a language that, just like this text, uses the left-to-right script. The forward version takes into account the context that happens before the word -- on the left-hand side. The backward version works in the opposite direction. It takes into account the context after the word -- on the right-hand side of the word. If this is true, then two same words that appear at the beginning of two different sentences should have identical forward embeddings, because their context is null. Let's test this out: Because we are using a forward model, it only takes into account the context that occurs before a word. Additionally, since our word has no context on the left-hand side of its position in the sentence, the two embeddings are identical, and the code assumes they are identical, indeed output is **True**. -
@@ -875,9 +698,7 @@ cat(" s1 sentence:", paste(s1[0], sep = ""), "\n", "s2 sentence:", paste(s2[0], ```
- We test whether the sum of the two 2048 embeddings of `nice` is equal to 2048. If it is true, it indicates that the embedding results are consistent, which should theoretically be the case. -
@@ -902,12 +723,9 @@ length(s1[0]$embedding$numpy()) == sum(s1[0]$embedding$numpy() == s2[0]$embeddi ```
- The measure of similarity between two vectors in an inner product space is known as cosine similarity. The formula for calculating cosine similarity between two vectors, such as vectors A and B, is as follows: $Cosine Similarity = \frac{\sum_{i} (A_i \cdot B_i)}{\sqrt{\sum_{i} (A_i^2)} \cdot \sqrt{\sum_{i} (B_i^2)}}$ - -
```{r} @@ -924,16 +742,12 @@ print(cosine_similarity) ``` - - ------------------------------------------------------------------------ ## Extracting Embeddings from BERT
- First, we utilize the `flair.embeddings.TransformerWordEmbeddings` function to download BERT, and more transformer models can also be found on [Flair NLP's Hugging Face](https://huggingface.co/flair). -
```{r} @@ -1004,42 +818,8 @@ word_embeddings_matrix <- as.data.frame(pca_result$x[,1:3]) word_embeddings_matrix ``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #### 2D Plot + ```{r, out.width="95%" } library(ggplot2) glove_plot2D <- ggplot(word_embeddings_matrix, aes(x = PC1, y = PC2, color = PC3, @@ -1121,66 +901,6 @@ word_embeddings_matrix <- as.data.frame(pca_result$x[,1:3]) word_embeddings_matrix ``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ```{r, out.width="95%" } # 2D Plot library(ggplot2) @@ -1233,30 +953,6 @@ word_embeddings_matrix <- as.data.frame(pca_result$x[,1:3]) word_embeddings_matrix ``` - - - - - - - - - - - - - - - - - - - - - - - - ```{r, out.width="95%" } # 2D Plot library(ggplot2) @@ -1345,7 +1041,6 @@ label_dict = corpus$make_label_dictionary(label_type=lbl_type) ## Loading the Embeddings
- flaiR covers all the different types of document embeddings that we can use. Here, we simply use `DocumentPoolEmbeddings`. They require no training prior to training the classification model itself:
@@ -1370,9 +1065,7 @@ classifier <- TextClassifier(document_embeddings, ```
- `$to` allows you to set the device to use CPU, GPU, or specific MPS devices on Mac (such as mps:0, mps:1, mps:2). -
```{r eval=FALSE, include=TRUE} @@ -1402,14 +1095,12 @@ TextClassifier( ## Training the Model
- Training the text classifier model involves two simple steps: - Defining the model trainer class by passing in the classifier model and the corpus - Setting off the training process passing in the required training hyper-parameters. **It is worth noting that the 'L' in numbers like 32L and 5L is used in R to denote that the number is an integer. Without the 'L' suffix, numbers in R are treated as numeric, which are by default double-precision floating-point numbers. In contrast, Python determines the type based on the value of the number itself. Whole numbers (e.g., 5 or 32) are of type int, while numbers with decimal points (e.g., 5.0) are of type float. Floating-point numbers in both languages are representations of real numbers but can have some approximation due to the way they are stored in memory.** -
@@ -1435,9 +1126,7 @@ trainer$train('classifier', ## Loading and Using the Classifiers
- After training the text classification model, the resulting classifier will already be stored in memory as part of the classifier variable. It is possible, however, that your Python session exited after training. If so, you'll need to load the model into memory with the following: -
```{r} @@ -1469,14 +1158,11 @@ print(sentence$labels) # Training RNNs
- Here, we train a sentiment analysis model to categorize text. In this case, we also include a pipeline that implements the use of Recurrent Neural Networks (RNN). This makes them particularly effective for tasks involving sequential data. This section also show you how to implement one of most powerful features in flaiR, stacked embeddings. You can stack multiple embeddings with different layers and let the classifier learn from different types of features. In Flair NLP, and with the **flaiR** package, it's very easy to accomplish this task.
## Import Necessary Modules - - ```{r} library(flaiR) WordEmbeddings <- flair_embeddings()$WordEmbeddings @@ -1490,9 +1176,7 @@ ModelTrainer <- flair_trainers()$ModelTrainer ## Get the IMDB Corpus
- The IMDB movie review dataset is used here, which is a commonly utilized dataset for sentiment analysis. `$downsample(0.1)` method means only 10% of the dataset is used, allowing for a faster demonstration. -
@@ -1508,7 +1192,6 @@ label_dict <- corpus$make_label_dictionary(label_type=lbl_type) ## Stacked Embeddings
- This is one of Flair's most powerful features: it allows for the integration of embeddings to enable the model to learn from more sparse features. Three types of embeddings are utilized here: GloVe embeddings, and two types of Flair embeddings (forward and backward). Word embeddings are used to convert words into vectors.
@@ -1572,7 +1255,6 @@ print(sentence.labels)
We use data from *The Temporal Focus of Campaign Communication (2020 JOP)* as an example. Let's assume we receive the data for training from different times. First, suppose you have a dataset of 1000 entries called `cc_muller_old`. On another day, with the help of nice friends, you receive another set of data, adding 2000 entries in a dataset called `cc_muller_new`. Both subsets are from `data(cc_muller)`. We will show how to fine-tune a transformer model with `cc_muller_old`, and then continue with another round of fine-tuning using `cc_muller_new`. -
```{r} @@ -1606,9 +1288,7 @@ ModelTrainer <- flair_trainers()$ModelTrainer ```
- We use purrr to help us split sentences using Sentence from `flair_data()`, then use map2 to add labels, and finally use `Corpus` to segment the data. -
@@ -1644,9 +1324,7 @@ old_dev <- old_test[!test_id] ```
- If you do not provide a development set (dev set) while using Flair, it will automatically split the training data into training and development datasets. The test set is used for training the model and evaluating its final performance, whereas the development set is used for adjusting model parameters and preventing overfitting, or in other words, for early stopping of the model. -
@@ -1661,9 +1339,7 @@ document_embeddings <- TransformerDocumentEmbeddings('distilbert-base-uncased', ```
- First, the `$make_label_dictionary` function is used to automatically create a label dictionary for the classification task. The label dictionary is a mapping from label to index, which is used to map the labels to a tensor of label indices. Besides classification tasks, flaiR also supports other label types for training custom model. From the cc_muller dataset: Future (seen 423 times), Present (seen 262 times), Past (seen 131 times). -
@@ -1672,9 +1348,7 @@ old_label_dict <- old_corpus$make_label_dictionary(label_type="classification") ```
- `TextClassifier` is used to create a text classifier. The classifier takes the document embeddings (importing from `'distilbert-base-uncased'` from Hugging Face) and the label dictionary as input. The label type is also specified as classification. -
```{r} @@ -1769,7 +1443,6 @@ More R tutorial and documentation see [here](https://github.com/davidycliao/flai # Extending conText's Embedding Regression
- `ConText` is a fast, flexible, and transparent framework for estimating context-specific word and short document embeddings using [the 'a la carte' embeddings regression](https://github.com/prodriguezsosa/EmbeddingRegression), implemented by Rodriguez et al. In this case study, I'll demonstrate how to use the conText package alongside other embedding frameworks by working through the example provided in Rodriguez et al.'s [Quick Start Guide](https://github.com/prodriguezsosa/conText/blob/master/vignettes/quickstart.md). While ConText includes its own cross-lingual ALC Embeddings, this tutorial extends its capabilities by integrating it with flaiR. Through this tutorial integration, this tutorial shows how to: @@ -1786,7 +1459,6 @@ First of all, when loading the conText package, you'll find three pre-loaded dat - Flair NLP contextual embeddings (as described in [Akbik et al., COLING 2018 paper](https://flairnlp.github.io/docs/tutorial-embeddings/flair-embeddings)) - Integrated embeddings extracted from transformers like BERT. -
## Build Document-Embedding-Matrix with Other Embedding Frameworks @@ -1860,7 +1532,6 @@ fasttext_embeddings$embed(sentence) ```
- The `process_embeddings` function from flaiR extracts pre-embedded GloVe vectors from a sentence object and arranges them into a structured matrix. In this matrix, tokens are represented as rows, embedding dimensions as columns, and each row is labeled with its corresponding token text.
@@ -1871,7 +1542,6 @@ fasttext_subset <- process_embeddings(sentence, verbose = TRUE)
Create a feature co-occurrence matrix (FCM) from tokenized text and transform pre-trained FastText embeddings using co-occurrence information. -
```{r} @@ -1890,7 +1560,7 @@ ft_transform <- compute_transform( ) ``` -Calculate Document Embedding Matrix (DEM) using transformed FastText embeddings +Calculate Document Embedding Matrix (DEM) using transformed FastText embeddings. ```{r} # Calculate Document Embedding Matrix (DEM) using transformed FastText embeddings @@ -2068,12 +1738,10 @@ alc_model <- conText(formula = immigration ~ party + gender, verbose = FALSE) ``` -#### Document Embedding Matrix Construction Using Flair Contextual Stacked Embeddings +### Document Embedding Matrix Construction Using Flair Contextual Stacked Embeddings
- To facilitate the loading of different embedding types, I'll import the following __classes__ and __functions__ from flaiF: `WordEmbeddings`, `FlairEmbeddings`, `TransformerWordEmbeddings`, `StackedEmbeddings`, and `Sentence.` These components enable us to work with GloVe embeddings, Flair's contextual embeddings, and transformer-based embeddings from the HuggingFace library. -
```{r include=FALSE} @@ -2101,7 +1769,6 @@ flair_backward <- FlairEmbeddings('news-backward') ```
- Combine three different types of embeddings into a stacked embedding model. This creates a stacked embedding model that combines: @@ -2109,7 +1776,6 @@ This creates a stacked embedding model that combines: - `FastText embeddings`: Captures general word semantics - `Forward Flair`: Captures contextual information reading text left-to-right - `Backward Flair`: Captures contextual information reading text right-to-left -
```{r} @@ -2166,10 +1832,9 @@ st_model <- conText(formula = immigration ~ party + gender, verbose = FALSE) ``` -#### Document Embedding Matrix Construction with BERT +### Document Embedding Matrix Construction with BERT
- BERT embeddings provide powerful contextual representations through their bidirectional transformer architecture. These embeddings are good at understanding context from both directions within text, generating deep contextual representations through multiple transformer layers, and leveraging pre-training on large text corpora to achieve strong performance across NLP tasks. The classic BERT base model generates 768-dimensional embeddings for each token, providing rich semantic representations. @@ -2228,10 +1893,9 @@ bt_model <- conText(formula = immigration ~ party + gender, verbose = FALSE) ``` -#### Comparision of Different Embedding Approaches +### Comparision of Different Embedding Approaches
- While this tutorial doesn't determine a definitive "best" approach, it's important to understand the key distinctions between word embedding methods. BERT, FastText, Flair Stacked Embeddings, and GloVe can be categorized into two groups: dynamic and static embeddings. Dynamic embeddings, such as BERT and Flair, adapt their word representations based on context using high-dimensional vector spaces (BERT uses 768 dimensions in its base model). These methods excel at handling contextual nuances: BERT employs self-attention mechanisms and subword tokenization, while Flair uses character-level modeling. Both effectively handle out-of-vocabulary words through these mechanisms. From a4a2350487c3aeacc759ff769572120904f4e681 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 15:06:17 +0000 Subject: [PATCH 16/46] UPDATE 0.0.7 --- tests/testthat/test_get_entities.R | 90 ++++++++++++++ vignettes/get_entities.Rmd | 1 - vignettes/tutorial.Rmd | 188 ++++++++++++++++++++--------- 3 files changed, 221 insertions(+), 58 deletions(-) diff --git a/tests/testthat/test_get_entities.R b/tests/testthat/test_get_entities.R index 8b137891..decc11b8 100644 --- a/tests/testthat/test_get_entities.R +++ b/tests/testthat/test_get_entities.R @@ -1 +1,91 @@ +test_that("get_entities works with standard NER model", { + # 設置測試數據 + texts <- c( + "John Smith works at Google in New York.", + "The Eiffel Tower was built in 1889." 
+ ) + doc_ids <- c("doc1", "doc2") + + # 加載標準 NER 模型 + tagger_std <- load_tagger_ner('ner') + + # 基本功能測試 + results <- get_entities( + texts = texts, + doc_ids = doc_ids, + tagger = tagger_std, + batch_size = 2 + ) + + # 測試返回值結構 + expect_true(is.data.frame(results)) + + # 測試提取的實體 + expect_true(any(grepl("John Smith", results$entity))) + expect_true(any(grepl("Google", results$entity))) + expect_true(any(grepl("New York", results$entity))) + + expected_tags <- c("PER", "ORG", "LOC") + expect_true(all(results$tag[results$tag != "O"] %in% expected_tags)) +}) + +test_that("get_entities handles different parameters correctly", { + + text <- "John lives in Berlin." + tagger_std <- load_tagger_ner('ner') + + result_with_text <- get_entities( + texts = text, + doc_ids = 1, + tagger = tagger_std, + show.text_id = TRUE + ) + expect_true("text_id" %in% names(result_with_text)) + + # 測試批次大小 + result_batch <- get_entities( + texts = rep(text, 3), + doc_ids = c(1, 2, 3), + tagger = tagger_std, + batch_size = 2 + ) + expect_true(nrow(result_batch) >= 2) + + result_with_ids <- get_entities( + texts = text, + doc_ids = "doc1", + tagger = tagger_std + ) + expect_equal(unique(result_with_ids$doc_id), "doc1") +}) + +test_that("get_entities error handling", { + tagger_std <- load_tagger_ner('ner') + + # 測試空文本 + expect_error( + get_entities(texts = character(0), tagger = tagger_std), + "The texts cannot be NULL or empty." + ) + + # 測試 texts 和 doc_ids 長度不匹配 + expect_error( + get_entities( + texts = c("text1", "text2"), + doc_ids = "doc1", + tagger = tagger_std + ), + "The lengths of texts and doc_ids do not match." + ) + + # 測試無效的批次大小 + expect_error( + get_entities( + texts = "text", + tagger = tagger_std, + batch_size = 0 + ), + "Invalid batch size. It must be a positive integer." 
+ ) +}) diff --git a/vignettes/get_entities.Rmd b/vignettes/get_entities.Rmd index a8b2200d..45d42b53 100644 --- a/vignettes/get_entities.Rmd +++ b/vignettes/get_entities.Rmd @@ -23,7 +23,6 @@ system(paste(Sys.which("python3"), "-m pip install --upgrade pip")) system(paste(Sys.which("python3"), "-m pip install torch")) system(paste(Sys.which("python3"), "-m pip install flair")) library(reticulate) -library(flaiR) # # system(paste(reticulate::py_config()$python, "-m pip install flair")) # reticulate::py_install("flair") ``` diff --git a/vignettes/tutorial.Rmd b/vignettes/tutorial.Rmd index 5a8fdefd..2761a2cc 100644 --- a/vignettes/tutorial.Rmd +++ b/vignettes/tutorial.Rmd @@ -92,6 +92,7 @@ print(sentence) When you use Flair to handle text data,[^1] `Sentence` and `Token` objects often play central roles in many use cases. When you create a Sentence object, it automatically tokenizes the text, removing the need to create the Token object manually. Unlike R, which indexes from 1, Python indexes from 0. Therefore, when using a for loop, I use `seq_along(sentence) - 1`. The output should be something like: +
```{r} @@ -113,7 +114,9 @@ print(sentence$tokens) **Retrieve the Token**
+ To comprehend the string representation format of the Sentence object, tagging at least one token is adequate. Python's `get_token(n)` method allows us to retrieve the Token object for a particular token. Additionally, we can use **`[]`** to index a specific token. +
@@ -128,17 +131,21 @@ sentence[6] ```
+ Each word (and punctuation) in the text is treated as an individual Token object. These Token objects store text information and other possible linguistic information (such as part-of-speech tags or named entity tags) and embedding (if you used a model to generate them). While you do not need to create Token objects manually, understanding how to manage them is useful in situations where you might want to fine-tune the tokenization process. For example, you can control the exactness of tokenization by manually creating Token objects from a Sentence object. This makes Flair very flexible when handling text data since the automatic tokenization feature can be used for rapid development, while also allowing users to fine-tune their tokenization. +
**Annotate POS tag and NER tag**
+ The `add_label(label_type, value)` method can be employed to assign a label to the token. In Universal POS tags, if `sentence[10]` is 'see', 'seen' might be tagged as `VERB`, indicating it is a past participle form of a verb. +
```{r} @@ -150,7 +157,9 @@ print(sentence[10]) ```
+ We can also add a NER (Named Entity Recognition) tag to `sentence[4]`, "UCD", identifying it as a university in Dublin. +
```{r} @@ -162,7 +171,9 @@ print(sentence[4]) ```
+ If we print the sentence object, `Sentence[50]` provides information for 50 tokens → ['in'/ORG, 'seen'/VERB], thus displaying two tagging pieces of information. +
```{r} @@ -174,28 +185,19 @@ print(sentence) ## **Corpus** -
The Corpus object in Flair is a fundamental data structure that represents a dataset containing text samples, usually comprising of a training set, a development set (or validation set), and a test set. It's designed to work smoothly with Flair's models for tasks like named entity recognition, text classification, and more. -
**Attributes:** -
- - `train`: A list of sentences (List[Sentence]) that form the training dataset. - `dev` (or development): A list of sentences (List[Sentence]) that form the development (or validation) dataset. - `test`: A list of sentences (List[Sentence]) that form the test dataset. -
**Important Methods:** -
- -- `downsample`: This method allows you to downsample (reduce) the number of sentences in the train, dev, and test splits. -- `obtain_statistics`: This method gives a quick overview of the statistics of the corpus, including the number of sentences and the distribution of labels. -- `make_vocab_dictionary`: Used to create a vocabulary dictionary from the corpus. - -
+- `downsample`: This method allows you to downsample (reduce) the number of sentences in the train, dev, and test splits. +- `obtain_statistics`: This method gives a quick overview of the statistics of the corpus, including the number of sentences and the distribution of labels. +- `make_vocab_dictionary`: Used to create a vocabulary dictionary from the corpus. ```{r} library(flaiR) @@ -233,6 +235,7 @@ print(formatted_str) Below, we use data from the article [*The Temporal Focus of Campaign Communication*](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwjz_bS3p5KCAxWEWEEAHcuVAi4QFnoECA8QAQ&url=https%3A%2F%2Fwww.journals.uchicago.edu%2Fdoi%2Ffull%2F10.1086%2F715165&usg=AOvVaw3f_J3sXTrym2ZR64pF3ZtN&opi=89978449) by [Stefan Muller](https://muellerstefan.net), published in the *Journal of Politics* in 2020, as an example. First, we vectorize the `cc_muller$text` using the Sentence function to transform it into a list object. Then, we reformat `cc_muller$class_pro_retro` as a factor. It's essential to note that R handles numerical values differently than Python. In R, numerical values are represented with a floating point, so it's advisable to convert them into factors or strings. Lastly, we employ the map function from the purrr package to assign labels to each sentence corpus using the `$add_label` method. +
```{r} @@ -323,7 +326,9 @@ print(sentence) ```
+ Use a for loop to print out each POS tag. It's important to note that Python is indexed from 0. Therefore, in an R environment, we must use `seq_along(sentence$get_labels()) - 1`. +
@@ -337,7 +342,9 @@ for (i in seq_along(sentence$get_labels())) { ## Tag Part-of-Speech
-We use flaiR/POS-english for POS tagging in the standard models on Hugging Face. + +We use `flaiR/POS-english` for POS tagging in the standard models on Hugging Face. +
```{r} @@ -354,7 +361,7 @@ tagger <- Classifier$load('pos') ``` -Penn Treebank POS Tags Reference +_Penn Treebank POS Tags Reference_ | Tag | Description | Example | @@ -407,12 +414,15 @@ for (i in seq_along(sentence$get_labels())) { print(sentence$get_labels()[[i]]) } ``` + ## Detect Sentiment
+ Let's run sentiment analysis over the same sentence to determine whether it is POSITIVE or NEGATIVE. -You can do this with essentially the same code as above. Instead of loading the 'ner' model, you now load the 'sentiment' model: +You can do this with essentially the same code as above. Instead of loading the 'ner' model, you now load the `'sentiment'` model: +
```{r} @@ -444,8 +454,10 @@ print(sentence) ### Parts-of-Speech Tagging Across Full DataFrame
+ You can apply Part-of-Speech (POS) tagging across an entire DataFrame using Flair's pre-trained models. Let's walk through an example using the pos-fast model. You can apply Part-of-Speech (POS) tagging across an entire DataFrame using Flair's pre-trained models. Let's walk through an example using the pos-fast model. First, let's load our required packages and sample data: +
```{r} @@ -455,7 +467,9 @@ uk_immigration <- uk_immigration[1:2,] ```
+ For POS tagging, we'll use Flair's pre-trained model. The pos-fast model offers a good balance between speed and accuracy. For more pre-trained models, check out Flair's documentation at Flair POS Tagging Documentation. There are two ways to load the POS tagger: +
@@ -492,10 +506,10 @@ head(results, n = 10) This section focuses on performing Named Entity Recognition (NER) on data stored in a dataframe format. My goal is to identify and tag named entities within text that is organized in a structured dataframe. - I load the flaiR package and use the built-in uk_immigration dataset. For demonstration purposes, I'm only taking the first two rows. This dataset contains discussions about immigration in the UK. Load the pre-trained model `ner`. For more pre-trained models, see . +
@@ -506,7 +520,9 @@ uk_immigration <- head(uk_immigration, n = 2) ```
+ Next, I load the latest model hosted and maintained on Hugging Face by the Flair NLP team. For more Flair NER models, you can visit the official Flair NLP page on Hugging Face (https://huggingface.co/flair). +
```{r} @@ -518,7 +534,9 @@ tagger_ner <- load_tagger_ner("flair/ner-english-ontonotes") ```
+ I load a pre-trained NER model. Since I'm using a Mac M1/M2, I set the model to run on the MPS device for faster processing. If I want to use other pre-trained models, I can check the Flair documentation website for available options. +
Now I'm ready to process the text: @@ -538,19 +556,26 @@ head(results, n = 10) # Embedding
+ Flair is a very popular natural language processing library, providing a variety of embedding methods for text representation. Flair Embeddings is a word embedding framework developed by [Zalando](https://engineering.zalando.com/posts/2018/11/zalando-research-releases-flair.html). It focuses on word-level representation and can capture contextual information of words, allowing the same word to have different embeddings in different contexts. Unlike traditional word embeddings (such as Word2Vec or GloVe), Flair can dynamically generate word embeddings based on context and has achieved excellent results in various NLP tasks. Below are some key points about Flair Embeddings: -
+
**Context-Aware**
+ Flair is a dynamic word embedding technique that can understand the meaning of words based on context. In contrast, static word embeddings, such as Word2Vec or GloVe, provide a fixed embedding for each word without considering its context in a sentence. Therefore, context-sensitive embedding techniques, such as Flair, can capture the meaning of words in specific sentences more accurately, thus enhancing the performance of language models in various tasks. +
+ + __Example:__ +
+ Consider the following two English sentences: - "I am interested in the bank of the river." @@ -562,12 +587,11 @@ For static embeddings, the word "bank" might have an embedding that lies somewhe
+ ```{r} +# Initialize Flair embeddings FlairEmbeddings <- flair_embeddings()$FlairEmbeddings Sentence <- flair_data()$Sentence - - -# Initialize Flair embeddings flair_embedding_forward <- FlairEmbeddings('news-forward') # Define the two sentences @@ -585,9 +609,11 @@ bank_embedding_sentence2 = sentence2[6]$embedding # "bank" is the sixth word ```
+ Same word, similar vector representation, but essentially different. In this way, you can see how the dynamic embeddings for "bank" in the two sentences differ based on context. Although we printed the embeddings here, in reality, they would be high-dimensional vectors, so you might see a lot of numbers. If you want a more intuitive view of the differences, you could compute the cosine similarity or other metrics between the two embeddings. This is just a simple demonstration. In practice, you can also combine multiple embedding techniques, such as `WordEmbeddings` and `FlairEmbeddings`, to get richer word vectors. +
```{r} @@ -599,22 +625,28 @@ cosine(as.numeric( bank_embedding_sentence1$numpy()), **Character-Based**
+ Flair uses a character-level language model, meaning it can generate embeddings for rare words or even misspelled words. This is an important feature because it allows the model to understand and process words that have never appeared in the training data. Flair uses a bidirectional LSTM (Long Short-Term Memory) network that operates at a character level. This allows it to feed individual characters into the LSTM instead of words. +
**Multilingual Support**
+ Flair provides various pre-trained character-level language models, supporting contextual word embeddings for multiple languages. It allows you to easily combine different word embeddings (e.g., Flair Embeddings, Word2Vec, GloVe, etc.) to create powerful stacked embeddings. +
## Classic Wordembeddings
+ In Flair, the simplest form of embeddings that still contains semantic information about the word are called classic word embeddings. These embeddings are pre-trained and non-contextual. Let's retrieve a few word embeddings and use FastText embeddings with the following code. To do so, we simply instantiate a WordEmbeddings class by passing in the ID of the embedding of our choice. Then, we simply wrap our text into a Sentence object, and call the `embed(sentence)` method on our WordEmbeddings class. +
@@ -632,7 +664,9 @@ for (i in seq_along(sentence$tokens)) { ```
+ Flair supports a range of classic word embeddings, each offering unique features and application scopes. Below is an overview, detailing the ID required to load each embedding and its corresponding language. +
| Embedding Type | ID | Language | @@ -674,15 +708,16 @@ Flair supports a range of classic word embeddings, each offering unique features | FastText (news & Wikipedia) | tr | Turkish | | FastText (news & Wikipedia) | zh | Chinese | -
## Contexual Embeddings
+ The idea behind contextual string embeddings is that each word embedding should be defined by not only its syntactic-semantic meaning but also the context it appears in. What this means is that each word will have a different embedding for every context it appears in. Each pre-trained Flair model offers a **forward** version and a **backward** version. Let's assume you are processing a language that, just like this text, uses the left-to-right script. The forward version takes into account the context that happens before the word -- on the left-hand side. The backward version works in the opposite direction. It takes into account the context after the word -- on the right-hand side of the word. If this is true, then two same words that appear at the beginning of two different sentences should have identical forward embeddings, because their context is null. Let's test this out: Because we are using a forward model, it only takes into account the context that occurs before a word. Additionally, since our word has no context on the left-hand side of its position in the sentence, the two embeddings are identical, and the code assumes they are identical, indeed output is **True**. +
@@ -698,7 +733,9 @@ cat(" s1 sentence:", paste(s1[0], sep = ""), "\n", "s2 sentence:", paste(s2[0], ```
+ We test whether the sum of the two 2048 embeddings of `nice` is equal to 2048. If it is true, it indicates that the embedding results are consistent, which should theoretically be the case. +
@@ -723,9 +760,11 @@ length(s1[0]$embedding$numpy()) == sum(s1[0]$embedding$numpy() == s2[0]$embeddi ```
+ The measure of similarity between two vectors in an inner product space is known as cosine similarity. The formula for calculating cosine similarity between two vectors, such as vectors A and B, is as follows: $Cosine Similarity = \frac{\sum_{i} (A_i \cdot B_i)}{\sqrt{\sum_{i} (A_i^2)} \cdot \sqrt{\sum_{i} (B_i^2)}}$ +
```{r} @@ -747,7 +786,9 @@ print(cosine_similarity) ## Extracting Embeddings from BERT
-First, we utilize the `flair.embeddings.TransformerWordEmbeddings` function to download BERT, and more transformer models can also be found on [Flair NLP's Hugging Face](https://huggingface.co/flair). + +First, we utilize the `TransformerWordEmbeddings` function to download BERT, and more transformer models can also be found on [Flair NLP's Hugging Face](https://huggingface.co/flair). +
```{r} @@ -848,8 +889,6 @@ glove_plot3D <- plot_ly(data = word_embeddings_matrix, glove_plot3D ``` - - ### Stack Embeddings Method (GloVe + Back/forwad FlairEmbeddings or More) ```{r} @@ -912,7 +951,6 @@ stacked_plot2D <- ggplot(word_embeddings_matrix, aes(x = PC1, y = PC2, color = P scale_color_gradient(low = "blue", high = "red") + theme_minimal() + labs(title = "", x = "PC1", y = "PC2", color = "PC3") - # guides(color = "none") stacked_plot2D ``` @@ -983,23 +1021,21 @@ figure <- ggarrange(glove_plot2D, stacked_plot2D, bert_plot2D, figure ``` + ------------------------------------------------------------------------ # Training a Binary Classifier -
In this section, we'll train a sentiment analysis model that can categorize text as either positive or negative. This case study is adapted from pages 116 to 130 of Tadej Magajna's book, '[Natural Language Processing with Flair](https://www.packtpub.com/product/natural-language-processing-with-flair/9781801072311)'. The process for training text classifiers in Flair mirrors the process followed for sequence labeling models. Specifically, the steps to train text classifiers are: + - Load a tagged corpus and compute the label dictionary map. - Prepare the document embeddings. - Initialize the `TextClassifier` class. - Train the model. -
- - ## Loading a Tagged Corpus
@@ -1041,7 +1077,9 @@ label_dict = corpus$make_label_dictionary(label_type=lbl_type) ## Loading the Embeddings
+ flaiR covers all the different types of document embeddings that we can use. Here, we simply use `DocumentPoolEmbeddings`. They require no training prior to training the classification model itself: +
@@ -1052,8 +1090,6 @@ glove = WordEmbeddings('glove') document_embeddings = DocumentPoolEmbeddings(glove) ``` - - ## Initializing the TextClassifier ```{r} @@ -1064,9 +1100,9 @@ classifier <- TextClassifier(document_embeddings, label_type = lbl_type) ``` -
+ `$to` allows you to set the device to use CPU, GPU, or specific MPS devices on Mac (such as mps:0, mps:1, mps:2). -
+ ```{r eval=FALSE, include=TRUE} classifier$to(flair_device("mps")) @@ -1094,13 +1130,17 @@ TextClassifier( ## Training the Model -
+ Training the text classifier model involves two simple steps: -- Defining the model trainer class by passing in the classifier model and the corpus -- Setting off the training process passing in the required training hyper-parameters. +- Defining the model trainer class by passing in the classifier model and the corpus +- Setting off the training process passing in the required training hyper-parameters. + + +
**It is worth noting that the 'L' in numbers like 32L and 5L is used in R to denote that the number is an integer. Without the 'L' suffix, numbers in R are treated as numeric, which are by default double-precision floating-point numbers. In contrast, Python determines the type based on the value of the number itself. Whole numbers (e.g., 5 or 32) are of type int, while numbers with decimal points (e.g., 5.0) are of type float. Floating-point numbers in both languages are representations of real numbers but can have some approximation due to the way they are stored in memory.** +
@@ -1126,7 +1166,9 @@ trainer$train('classifier', ## Loading and Using the Classifiers
+ After training the text classification model, the resulting classifier will already be stored in memory as part of the classifier variable. It is possible, however, that your Python session exited after training. If so, you'll need to load the model into memory with the following: +
```{r} @@ -1158,7 +1200,9 @@ print(sentence$labels) # Training RNNs
+ Here, we train a sentiment analysis model to categorize text. In this case, we also include a pipeline that implements the use of Recurrent Neural Networks (RNN). This makes them particularly effective for tasks involving sequential data. This section also show you how to implement one of most powerful features in flaiR, stacked embeddings. You can stack multiple embeddings with different layers and let the classifier learn from different types of features. In Flair NLP, and with the **flaiR** package, it's very easy to accomplish this task. +
## Import Necessary Modules @@ -1176,7 +1220,9 @@ ModelTrainer <- flair_trainers()$ModelTrainer ## Get the IMDB Corpus
+ The IMDB movie review dataset is used here, which is a commonly utilized dataset for sentiment analysis. `$downsample(0.1)` method means only 10% of the dataset is used, allowing for a faster demonstration. +
@@ -1192,7 +1238,9 @@ label_dict <- corpus$make_label_dictionary(label_type=lbl_type) ## Stacked Embeddings
+ This is one of Flair's most powerful features: it allows for the integration of embeddings to enable the model to learn from more sparse features. Three types of embeddings are utilized here: GloVe embeddings, and two types of Flair embeddings (forward and backward). Word embeddings are used to convert words into vectors. +
@@ -1227,6 +1275,7 @@ For the sake of this example, setting max_epochs to 5. You might want to increas It is worth noting that the learning rate is a parameter that determines the step size at each iteration while moving towards a minimum of the loss function. A smaller learning rate could slow down the learning process, but it could lead to more precise convergence. `mini_batch_size` determines the number of samples that will be used to compute the gradient at each step. The 'L' in 32L is used in R to denote that the number is an integer. `patience` (aka early stop) is a hyper-parameter used in conjunction with early stopping to avoid overfitting. It determines the number of epochs the training process will tolerate without improvements before stopping the training. Setting max_epochs to 5 means the algorithm will make five passes through the dataset. +
@@ -1242,7 +1291,6 @@ trainer$train('models/sentiment', ## To Apply the Trained Model for Prediction - ```{r eval=FALSE, include=TRUE} sentence <- "This movie was really exciting!" classifier$predict(sentence) @@ -1254,7 +1302,9 @@ print(sentence.labels) # Finetune Transformers
+ We use data from *The Temporal Focus of Campaign Communication (2020 JOP)* as an example. Let's assume we receive the data for training from different times. First, suppose you have a dataset of 1000 entries called `cc_muller_old`. On another day, with the help of nice friends, you receive another set of data, adding 2000 entries in a dataset called `cc_muller_new`. Both subsets are from `data(cc_muller)`. We will show how to fine-tune a transformer model with `cc_muller_old`, and then continue with another round of fine-tuning using `cc_muller_new`. +
```{r} @@ -1265,7 +1315,6 @@ library(flaiR) ## Fine-tuning a Transformers Model - **Step 1** Load Necessary Modules from Flair Load necessary classes from `flair` package. @@ -1288,7 +1337,9 @@ ModelTrainer <- flair_trainers()$ModelTrainer ```
+ We use purrr to help us split sentences using Sentence from `flair_data()`, then use map2 to add labels, and finally use `Corpus` to segment the data. +
@@ -1324,9 +1375,10 @@ old_dev <- old_test[!test_id] ```
+ If you do not provide a development set (dev set) while using Flair, it will automatically split the training data into training and development datasets. The test set is used for training the model and evaluating its final performance, whereas the development set is used for adjusting model parameters and preventing overfitting, or in other words, for early stopping of the model. -
+
```{r} old_corpus <- Corpus(train = old_train, test = old_test) @@ -1339,7 +1391,9 @@ document_embeddings <- TransformerDocumentEmbeddings('distilbert-base-uncased', ```
+ First, the `$make_label_dictionary` function is used to automatically create a label dictionary for the classification task. The label dictionary is a mapping from label to index, which is used to map the labels to a tensor of label indices. Besides classification tasks, flaiR also supports other label types for training custom models. From the cc_muller dataset: Future (seen 423 times), Present (seen 262 times), Past (seen 131 times). +
@@ -1348,7 +1402,9 @@ old_label_dict <- old_corpus$make_label_dictionary(label_type="classification") ```
+ `TextClassifier` is used to create a text classifier. The classifier takes the document embeddings (importing from `'distilbert-base-uncased'` from Hugging Face) and the label dictionary as input. The label type is also specified as classification. +
```{r} @@ -1381,10 +1437,14 @@ old_trainer$train("vignettes/inst/muller-campaign-communication", Now, we can continue to fine tune the already fine tuned model with an additional 2000 pieces of data. First, let's say we have another 2000 entries called `cc_muller_new`. We can fine-tune the previous model with these 2000 entries. The steps are the same as before. For this case, we don't need to split the dataset again. We can use the entire 2000 entries as the training set and use the `old_test` set to evaluate how well our refined model performs. +
+ **Step 1** Load the `muller-campaign-communication` Model + Load the model (`old_model`) you have already fine tuned from previous stage and let's fine tune it with the new data, `new_corpus`. + ```{r} old_model <- TextClassifier$load("vignettes/inst/muller-campaign-communication/best-model.pt") ``` @@ -1425,6 +1485,7 @@ new_trainer$train("vignettes/inst/new-muller-campaign-communication", ## Model Performance Metrics: Pre and Post Fine-tuning + After fine-tuning for 1 epoch, the model showed improved performance on the same test set. @@ -1435,7 +1496,7 @@ After fine-tuning for 1 epoch, the model showed improved performance on the same | Accuracy | 0.7294 | 0.8471 | +0.1177 | More R tutorial and documentation see [here](https://github.com/davidycliao/flaiR). -
+ ------------------------------------------------------------------------ @@ -1443,23 +1504,33 @@ More R tutorial and documentation see [here](https://github.com/davidycliao/flai # Extending conText's Embedding Regression
+ `ConText` is a fast, flexible, and transparent framework for estimating context-specific word and short document embeddings using [the 'a la carte' embeddings regression](https://github.com/prodriguezsosa/EmbeddingRegression), implemented by Rodriguez et al. In this case study, I'll demonstrate how to use the conText package alongside other embedding frameworks by working through the example provided in Rodriguez et al.'s [Quick Start Guide](https://github.com/prodriguezsosa/conText/blob/master/vignettes/quickstart.md). While ConText includes its own cross-lingual ALC Embeddings, this tutorial extends its capabilities by integrating it with flaiR. Through this tutorial integration, this tutorial shows how to: +
+ + - Access flaiR's powerful embedding models - Connect with any transformer-based embedding models from HuggingFace via FlaiR + +
+ I'll be following the example directly from [Rodriguez et al.'s Quick Start Guide]() as this case study. It's important to note that results obtained using alternative embedding frameworks may deviate from the original implementation, and should be interpreted with caution. These comparative results are primarily intended for reference and educational use. First of all, when loading the conText package, you'll find three pre-loaded datasets: `cr_sample_corpus`, `cr_glove_subset`, and `cr_transform.` These datasets are used in the package's tutorial to demonstrate preprocessing steps. For this exercise, I'll use `cr_sample_corpus` to explore other embedding frameworks, including: +
+ + - `en-crawl` embedding - Flair NLP contextual embeddings (as described in [Akbik et al., COLING 2018 paper](https://flairnlp.github.io/docs/tutorial-embeddings/flair-embeddings)) - Integrated embeddings extracted from transformers like BERT. - + ## Build Document-Embedding-Matrix with Other Embedding Frameworks @@ -1473,12 +1544,9 @@ library(flaiR) **Step 1** Tokenize Text with `quanteda` and `conText` -
First, let's tokenize `cr_sample_corpus` using the tokens_context function from the conText package. -
- ```{r} # tokenize corpus removing unnecessary (i.e. semantically uninformative) elements toks <- tokens(cr_sample_corpus, remove_punct=T, remove_symbols=T, remove_numbers=T, remove_separators=T) @@ -1502,7 +1570,9 @@ immig_dfm <- dfm(immig_toks) **Step 2** Import Embedding Tools
+ To facilitate the loading of different embedding types, I'll import the following __classes__ and __functions__ from flaiR: `WordEmbeddings`, `FlairEmbeddings`, `TransformerWordEmbeddings`, `StackedEmbeddings`, and `Sentence.` These components enable us to work with GloVe embeddings, Flair's contextual embeddings, and transformer-based embeddings from the HuggingFace library. +
```{r include=FALSE} @@ -1520,7 +1590,9 @@ fasttext_embeddings <- WordEmbeddings('en-news') ```
+ Initialize a Flair Sentence object by concatenating cr_glove_subset row names. The collapse parameter ensures proper tokenization by adding space delimiters. Then, embed the sentence text using the loaded fasttext embeddings. +
```{r} @@ -1532,16 +1604,21 @@ fasttext_embeddings$embed(sentence) ```
+ The `process_embeddings` function from flaiR extracts pre-embedded GloVe vectors from a sentence object and arranges them into a structured matrix. In this matrix, tokens are represented as rows, embedding dimensions as columns, and each row is labeled with its corresponding token text. +
```{r} fasttext_subset <- process_embeddings(sentence, verbose = TRUE) ``` + **Step 3** Computing Context-Specific Word Embeddings Using FastText
+ Create a feature co-occurrence matrix (FCM) from tokenized text and transform pre-trained FastText embeddings using co-occurrence information. +
```{r} @@ -1572,6 +1649,7 @@ immig_dem_ft <- dem(x = immig_dfm, ``` Show each document inherits its corresponding docvars. + ```{r} head(immig_dem_ft@docvars) ``` @@ -1596,6 +1674,7 @@ ft_model <- conText(formula = immigration ~ party + gender, ``` extract D-dimensional beta coefficients. + ```{r} # The intercept in this case is the fastext embedding for female Democrats # beta coefficients can be combined to get each group's fastext embedding @@ -1605,9 +1684,8 @@ RF_wv <- ft_model['(Intercept)',] + ft_model['party_R',] # (R)epublican - (F)em RM_wv <- ft_model['(Intercept)',] + ft_model['party_R',] + ft_model['gender_M',] # (R)epublican - (M)ale ``` - +nearest neighbors ```{r} -# nearest neighbors nns(rbind(DF_wv,DM_wv), N = 10, pre_trained = fasttext_subset, @@ -1647,7 +1725,6 @@ immig_wv_ft_party <- dem_group(immig_dem_ft, dim(immig_wv_ft_party) ``` - Find nearest neighbors by party ```{r} # find nearest neighbors by party @@ -1741,7 +1818,9 @@ alc_model <- conText(formula = immigration ~ party + gender, ### Document Embedding Matrix Construction Using Flair Contextual Stacked Embeddings
-To facilitate the loading of different embedding types, I'll import the following __classes__ and __functions__ from flaiF: `WordEmbeddings`, `FlairEmbeddings`, `TransformerWordEmbeddings`, `StackedEmbeddings`, and `Sentence.` These components enable us to work with GloVe embeddings, Flair's contextual embeddings, and transformer-based embeddings from the HuggingFace library. + +To facilitate the loading of different embedding types, I'll import the following __classes__ and __functions__ from flaiR: `WordEmbeddings`, `FlairEmbeddings`, `TransformerWordEmbeddings`, `StackedEmbeddings`, and `Sentence`. These components enable us to work with GloVe embeddings, Flair's contextual embeddings, and transformer-based embeddings from the HuggingFace library. +
```{r include=FALSE} @@ -1768,7 +1847,6 @@ flair_backward <- FlairEmbeddings('news-backward') ``` -
Combine three different types of embeddings into a stacked embedding model. This creates a stacked embedding model that combines: @@ -1776,7 +1854,7 @@ This creates a stacked embedding model that combines: - `FastText embeddings`: Captures general word semantics - `Forward Flair`: Captures contextual information reading text left-to-right - `Backward Flair`: Captures contextual information reading text right-to-left -
+ ```{r} stacked_embeddings <- StackedEmbeddings(list( @@ -1787,9 +1865,6 @@ stacked_embeddings <- StackedEmbeddings(list( ``` ```{r} - -fasttext_subset <- process_embeddings(sentence, verbose = TRUE) - # Step 1: Create a Flair Sentence object from the text sentence <- Sentence(paste(rownames(cr_glove_subset), collapse = " ")) @@ -1835,9 +1910,9 @@ st_model <- conText(formula = immigration ~ party + gender, ### Document Embedding Matrix Construction with BERT
-BERT embeddings provide powerful contextual representations through their bidirectional transformer architecture. These embeddings are good at understanding context from both directions within text, generating deep contextual representations through multiple transformer layers, and leveraging pre-training on large text corpora to achieve strong performance across NLP tasks. +BERT embeddings provide powerful contextual representations through their bidirectional transformer architecture. These embeddings are good at understanding context from both directions within text, generating deep contextual representations through multiple transformer layers, and leveraging pre-training on large text corpora to achieve strong performance across NLP tasks. The classic BERT base model generates 768-dimensional embeddings for each token, providing rich semantic representations. -The classic BERT base model generates 768-dimensional embeddings for each token, providing rich semantic representations. +
By utilizing the Flair framework, we also can seamlessly integrate: @@ -1846,7 +1921,6 @@ By utilizing the Flair framework, we also can seamlessly integrate: - Domain-adapted BERT models - Any transformer model available on HuggingFace - ```{r} @@ -1854,7 +1928,6 @@ By utilizing the Flair framework, we also can seamlessly integrate: bert_embeddings <- TransformerWordEmbeddings('bert-base-uncased') ``` - ```{r} # Step 1: Create a Flair Sentence object from the text sentence <- Sentence(paste(rownames(cr_glove_subset), collapse = " ")) @@ -1896,6 +1969,7 @@ bt_model <- conText(formula = immigration ~ party + gender, ### Comparision of Different Embedding Approaches
+ While this tutorial doesn't determine a definitive "best" approach, it's important to understand the key distinctions between word embedding methods. BERT, FastText, Flair Stacked Embeddings, and GloVe can be categorized into two groups: dynamic and static embeddings. Dynamic embeddings, such as BERT and Flair, adapt their word representations based on context using high-dimensional vector spaces (BERT uses 768 dimensions in its base model). These methods excel at handling contextual nuances: BERT employs self-attention mechanisms and subword tokenization, while Flair uses character-level modeling. Both effectively handle out-of-vocabulary words through these mechanisms. From 206241a68d05c07cd8b472afb7f317988f33d900 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 15:08:00 +0000 Subject: [PATCH 17/46] UPDATE 0.0.7 --- tests/testthat/test_get_entities.R | 3 --- tests/testthat/test_util.R | 17 ----------------- 2 files changed, 20 deletions(-) diff --git a/tests/testthat/test_get_entities.R b/tests/testthat/test_get_entities.R index decc11b8..9250da46 100644 --- a/tests/testthat/test_get_entities.R +++ b/tests/testthat/test_get_entities.R @@ -1,6 +1,5 @@ test_that("get_entities works with standard NER model", { - # 設置測試數據 texts <- c( "John Smith works at Google in New York.", "The Eiffel Tower was built in 1889." @@ -63,7 +62,6 @@ test_that("get_entities handles different parameters correctly", { test_that("get_entities error handling", { tagger_std <- load_tagger_ner('ner') - # 測試空文本 expect_error( get_entities(texts = character(0), tagger = tagger_std), "The texts cannot be NULL or empty." @@ -79,7 +77,6 @@ test_that("get_entities error handling", { "The lengths of texts and doc_ids do not match." 
) - # 測試無效的批次大小 expect_error( get_entities( texts = "text", diff --git a/tests/testthat/test_util.R b/tests/testthat/test_util.R index 9a8f2b56..1c24ec3a 100644 --- a/tests/testthat/test_util.R +++ b/tests/testthat/test_util.R @@ -115,20 +115,3 @@ test_that("check_texts_and_ids handles input correctly", { expect_equal(res$doc_ids, c("id1", "id2")) }) - -# Test 14: check_flair_installed identifies whether flair is available - -test_that("check_flair_installed identifies whether flair is available", { - # Mocking that the module is available - with_mock( - `reticulate::py_module_available` = function(...) TRUE, - expect_true(check_flair_installed()) - ) - - # Mocking that the module is not available - with_mock( - `reticulate::py_module_available` = function(...) FALSE, - expect_false(check_flair_installed()) - ) -}) - From 9b7d62918c50623cbc9e3d4f1b82e7bec4a62f15 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 15:22:10 +0000 Subject: [PATCH 18/46] UPDATE 0.0.7 --- .github/workflows/r_macos.yml | 208 ++++++++++++++++++++++++---------- vignettes/tutorial.Rmd | 8 +- 2 files changed, 151 insertions(+), 65 deletions(-) diff --git a/.github/workflows/r_macos.yml b/.github/workflows/r_macos.yml index 0f531561..504b01af 100644 --- a/.github/workflows/r_macos.yml +++ b/.github/workflows/r_macos.yml @@ -5,6 +5,87 @@ # # See https://github.com/r-lib/actions/tree/master/examples#readme for # additional example workflows available for the R community. 
+# name: R-MacOS +# +# on: +# push: +# branches: [ "main" ] +# pull_request: +# branches: [ "main" ] +# +# permissions: +# contents: read +# +# jobs: +# build: +# runs-on: macos-latest +# +# strategy: +# matrix: +# r-version: ['4.4.0', '4.3.2'] +# +# steps: +# - uses: actions/checkout@v3 +# +# - name: Update Homebrew +# run: | +# brew update +# +# - name: Install pandoc +# run: | +# for i in {1..3}; do +# brew install pandoc && break || sleep 15 +# done +# +# - name: Install gfortran and configure Makevars +# run: | +# brew install gcc +# mkdir -p ~/.R +# touch ~/.R/Makevars +# echo "FC=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars +# echo "F77=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars +# echo "FLIBS=-L$(brew --prefix)/lib/gcc/current -lgfortran -lquadmath -lm" >> ~/.R/Makevars +# echo "LDFLAGS=-L$(brew --prefix)/lib/gcc/current" >> ~/.R/Makevars +# +# - name: Set up R ${{ matrix.r-version }} +# uses: r-lib/actions/setup-r@v2 +# with: +# r-version: ${{ matrix.r-version }} +# +# - name: Install R dependencies +# run: | +# Rscript -e "install.packages(c('remotes', 'rcmdcheck', 'reticulate', 'renv', 'knitr', 'rmarkdown', 'lsa', 'purrr', 'testthat', 'htmltools'), repos='https://cran.r-project.org')" +# Rscript -e "if (getRversion() >= '4.4.0') remotes::install_version('Matrix', version = '1.5.3') else install.packages('Matrix', type = 'binary')" +# Rscript -e "remotes::install_version('htmltools', version = '0.5.8')" +# Rscript -e "renv::restore()" +# +# - name: Set up Python +# uses: actions/setup-python@v2 +# with: +# python-version: '3.10.x' +# +# - name: Install Python virtualenv +# run: pip install virtualenv +# +# - name: Create Python virtual environment +# run: virtualenv flair_env +# +# - name: Install Python dependencies in virtual environment +# run: | +# source flair_env/bin/activate +# pip install --upgrade pip +# pip install scipy==1.12.0 +# pip install flair +# +# - name: Remove Python cache files +# run: find . 
-name '*.pyc' -delete +# +# - name: Check (with virtual environment) +# run: | +# source flair_env/bin/activate +# R CMD build --no-build-vignettes . +# shell: bash + name: R-MacOS on: @@ -19,69 +100,74 @@ permissions: jobs: build: runs-on: macos-latest - strategy: matrix: r-version: ['4.4.0', '4.3.2'] + fail-fast: false steps: - - uses: actions/checkout@v3 - - - name: Update Homebrew - run: | - brew update - - - name: Install pandoc - run: | - for i in {1..3}; do - brew install pandoc && break || sleep 15 - done - - - name: Install gfortran and configure Makevars - run: | - brew install gcc - mkdir -p ~/.R - touch ~/.R/Makevars - echo "FC=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars - echo "F77=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars - echo "FLIBS=-L$(brew --prefix)/lib/gcc/current -lgfortran -lquadmath -lm" >> ~/.R/Makevars - echo "LDFLAGS=-L$(brew --prefix)/lib/gcc/current" >> ~/.R/Makevars - - - name: Set up R ${{ matrix.r-version }} - uses: r-lib/actions/setup-r@v2 - with: - r-version: ${{ matrix.r-version }} - - - name: Install R dependencies - run: | - Rscript -e "install.packages(c('remotes', 'rcmdcheck', 'reticulate', 'renv', 'knitr', 'rmarkdown', 'lsa', 'purrr', 'testthat', 'htmltools'), repos='https://cran.r-project.org')" - Rscript -e "if (getRversion() >= '4.4.0') remotes::install_version('Matrix', version = '1.5.3') else install.packages('Matrix', type = 'binary')" - Rscript -e "remotes::install_version('htmltools', version = '0.5.8')" - Rscript -e "renv::restore()" - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.10.x' - - - name: Install Python virtualenv - run: pip install virtualenv - - - name: Create Python virtual environment - run: virtualenv flair_env - - - name: Install Python dependencies in virtual environment - run: | - source flair_env/bin/activate - pip install --upgrade pip - pip install scipy==1.12.0 - pip install flair - - - name: Remove Python cache files - run: find . 
-name '*.pyc' -delete - - - name: Check (with virtual environment) - run: | - source flair_env/bin/activate - R CMD build --no-build-vignettes . - shell: bash + - uses: actions/checkout@v3 + + - name: Update Homebrew + run: | + brew update + + - name: Install pandoc + run: | + for i in {1..3}; do brew install pandoc && break || sleep 15 done + + - name: Install gfortran and configure Makevars + run: | + brew install gcc + mkdir -p ~/.R + touch ~/.R/Makevars + echo "FC=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars + echo "F77=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars + echo "FLIBS=-L$(brew --prefix)/lib/gcc/current -lgfortran -lquadmath -lm" >> ~/.R/Makevars + echo "LDFLAGS=-L$(brew --prefix)/lib/gcc/current" >> ~/.R/Makevars + + - name: Set up R ${{ matrix.r-version }} + uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.r-version }} + + - name: Install R dependencies + run: | + # 基礎包安裝 + Rscript -e "install.packages(c('remotes', 'rcmdcheck', 'reticulate', 'renv', 'knitr', 'rmarkdown', 'lsa', 'purrr', 'testthat'), repos='https://cran.r-project.org')" + + # 根據 R 版本安裝適當的 Matrix 版本 + Rscript -e 'if (getRversion() >= "4.4.0") remotes::install_version("Matrix", version = "1.5.3") else remotes::install_version("Matrix", version = "1.5.1")' + + # 安裝特定版本的 htmltools + Rscript -e "remotes::install_version('htmltools', version = '0.5.8')" + + # 最後執行 renv::restore() + Rscript -e "renv::restore()" + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.10.x' + + - name: Install Python virtualenv + run: pip install virtualenv + + - name: Create Python virtual environment + run: virtualenv flair_env + + - name: Install Python dependencies in virtual environment + run: | + source flair_env/bin/activate + pip install --upgrade pip + pip install scipy==1.12.0 + pip install flair + + - name: Remove Python cache files + run: find . 
-name '*.pyc' -delete + + - name: Check (with virtual environment) + run: | + source flair_env/bin/activate + R CMD build --no-build-vignettes . + shell: bash diff --git a/vignettes/tutorial.Rmd b/vignettes/tutorial.Rmd index 2761a2cc..6ae6f6c5 100644 --- a/vignettes/tutorial.Rmd +++ b/vignettes/tutorial.Rmd @@ -1970,13 +1970,13 @@ bt_model <- conText(formula = immigration ~ party + gender,
-While this tutorial doesn't determine a definitive "best" approach, it's important to understand the key distinctions between word embedding methods. BERT, FastText, Flair Stacked Embeddings, and GloVe can be categorized into two groups: dynamic and static embeddings. +While this tutorial doesn't determine a definitive best approach, it's important to understand the key distinctions between word embedding methods. BERT, FastText, Flair Stacked Embeddings, and GloVe can be categorized into two groups: dynamic and static embeddings. -Dynamic embeddings, such as BERT and Flair, adapt their word representations based on context using high-dimensional vector spaces (BERT uses 768 dimensions in its base model). These methods excel at handling contextual nuances: BERT employs self-attention mechanisms and subword tokenization, while Flair uses character-level modeling. Both effectively handle out-of-vocabulary words through these mechanisms. +Dynamic embeddings, such as BERT and Flair, adapt their word representations based on context using high-dimensional vector spaces (BERT uses 768 dimensions in its base model). BERT employs self-attention mechanisms and subword tokenization, while Flair uses character-level modeling. Both effectively handle out-of-vocabulary words through these mechanisms. -Static embeddings, like FastText (300 dimensions) and GloVe (typically 100-300 dimensions), maintain consistent word representations regardless of context. FastText stands out among static embeddings by incorporating subword information, giving it an advantage in handling out-of-vocabulary words, while GloVe relies on pre-computed global word-word co-occurrence statistics. +However, it's worth noting that in their case study, where they provide selected words, in our case study we directly extract individual word vectors from BERT and Flair (forward/backward) embeddings using those selected words. This approach doesn't truly achieve the intended contextual modeling. 
A more meaningful approach would be to extract embeddings at the quasi-sentence or paragraph level. Alternatively, pooling the entire document before extracting embeddings could be more valuable. -Each embedding method has its unique characteristics and trade-offs. While they share the common goal of capturing meaningful textual relationships, they differ in their balance of contextual awareness, computational demands, and representation stability. The choice among them depends on specific project requirements, including accuracy needs, computational resources, and the nature of the text being analyzed. +These context-based approaches differ significantly from GloVe's methodology, which relies on pre-computed global word-word co-occurrence statistics to generate static word vectors.
From 07717155261a3a80177ea3c742b6d197d94ccad3 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 15:26:33 +0000 Subject: [PATCH 19/46] Update r_macos.yml --- .github/workflows/r_macos.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/r_macos.yml b/.github/workflows/r_macos.yml index 504b01af..045ab67e 100644 --- a/.github/workflows/r_macos.yml +++ b/.github/workflows/r_macos.yml @@ -85,7 +85,6 @@ # source flair_env/bin/activate # R CMD build --no-build-vignettes . # shell: bash - name: R-MacOS on: @@ -109,12 +108,14 @@ jobs: - uses: actions/checkout@v3 - name: Update Homebrew - run: | - brew update + run: brew update - name: Install pandoc run: | - for i in {1..3}; do brew install pandoc && break || sleep 15 done + for i in {1..3} + do + brew install pandoc && break || sleep 15 + done - name: Install gfortran and configure Makevars run: | From 8e2204b6354594c22a933ba6c8719834e8883f76 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 15:32:55 +0000 Subject: [PATCH 20/46] Update r_macos.yml --- .github/workflows/r_macos.yml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/workflows/r_macos.yml b/.github/workflows/r_macos.yml index 045ab67e..f88214c9 100644 --- a/.github/workflows/r_macos.yml +++ b/.github/workflows/r_macos.yml @@ -135,16 +135,22 @@ jobs: - name: Install R dependencies run: | # 基礎包安裝 - Rscript -e "install.packages(c('remotes', 'rcmdcheck', 'reticulate', 'renv', 'knitr', 'rmarkdown', 'lsa', 'purrr', 'testthat'), repos='https://cran.r-project.org')" + Rscript -e 'install.packages(c("remotes", "rcmdcheck", "reticulate", "renv", "knitr", "rmarkdown", "lsa", "purrr", "testthat"), repos="https://cran.r-project.org")' - # 根據 R 版本安裝適當的 Matrix 版本 - Rscript -e 'if (getRversion() >= "4.4.0") remotes::install_version("Matrix", version = "1.5.3") else remotes::install_version("Matrix", version = "1.5.1")' + # 根據 R 版本有條件地安裝 Matrix + Rscript -e 
' + if (getRversion() >= "4.4.0") { + install.packages("Matrix") + } else { + remotes::install_version("Matrix", version = "1.5.1", repos = "https://cran.r-project.org") + } + ' - # 安裝特定版本的 htmltools - Rscript -e "remotes::install_version('htmltools', version = '0.5.8')" + # 安裝指定版本的 htmltools + Rscript -e 'remotes::install_version("htmltools", version = "0.5.8")' # 最後執行 renv::restore() - Rscript -e "renv::restore()" + Rscript -e 'renv::restore()' - name: Set up Python uses: actions/setup-python@v2 From 2cfb4b647afb895e48465cab8d145436822ae4a6 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 15:37:39 +0000 Subject: [PATCH 21/46] Update r_ubuntu.yaml --- .github/workflows/r_ubuntu.yaml | 282 ++++++++++++++++++++++---------- 1 file changed, 199 insertions(+), 83 deletions(-) diff --git a/.github/workflows/r_ubuntu.yaml b/.github/workflows/r_ubuntu.yaml index bc18b1d7..c1ad187b 100644 --- a/.github/workflows/r_ubuntu.yaml +++ b/.github/workflows/r_ubuntu.yaml @@ -74,92 +74,208 @@ # R CMD check *tar.gz --no-build-vignettes --no-manual --no-examples # shell: bash # +# # +# name: R-ubuntu +# +# on: +# push: +# branches: +# - main +# pull_request: +# branches: +# - main +# +# jobs: +# R-CMD-check: +# runs-on: ubuntu-20.04 +# strategy: +# matrix: +# r-version: ['4.3.2', '4.2.0', '4.2.1'] +# +# env: +# R_LIBS_USER: /home/runner/work/_temp/Library +# TZ: UTC +# _R_CHECK_SYSTEM_CLOCK_: FALSE +# NOT_CRAN: true +# RSPM: https://packagemanager.posit.co/cran/__linux__/focal/latest +# RENV_CONFIG_REPOS_OVERRIDE: https://packagemanager.posit.co/cran/__linux__/focal/latest +# +# steps: +# - uses: actions/checkout@v3 +# +# - name: Cache R dependencies +# uses: actions/cache@v2 +# with: +# path: ~/R/x86_64-pc-linux-gnu-library/ +# key: ${{ runner.os }}-r-${{ matrix.r-version }}-${{ hashFiles('**/renv.lock') }} +# restore-keys: ${{ runner.os }}-r-${{ matrix.r-version }}- # +# - name: Setup R +# uses: r-lib/actions/setup-r@v2 +# with: +# use-public-rspm: true +# 
r-version: ${{ matrix.r-version }} +# +# - name: Restore R environment +# run: | +# if (!requireNamespace('renv', quietly = TRUE)) install.packages('renv') +# renv::restore() +# shell: Rscript {0} +# +# - name: Install additional R packages +# env: +# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the default GitHub token for authentication +# run: | +# install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "ggplot2")) +# install.packages('remotes') +# remotes::install_github("davidycliao/flaiR", auth_token = Sys.getenv("GITHUB_TOKEN"), force = TRUE) +# shell: Rscript {0} +# +# - name: Set up Python +# uses: actions/setup-python@v2 +# with: +# python-version: '3.10.x' +# +# - name: Install Python virtualenv +# run: pip install virtualenv +# +# - name: Create Python virtual environment +# run: virtualenv flair_env +# +# - name: Install Python dependencies in virtual environment +# run: | +# source flair_env/bin/activate +# pip install --upgrade pip +# pip install scipy==1.12.0 +# pip install flair +# pip install gensim +# +# - name: Remove Python cache files +# run: find . -name '*.pyc' -delete +# +# - name: Check R environment status +# run: renv::status() +# shell: Rscript {0} +# +# # - name: Check R package (with virtual environment) +# # run: | +# # source flair_env/bin/activate +# # R CMD build . 
--no-build-vignettes +# # R CMD check *tar.gz --no-build-vignettes --no-manual --no-tests --no-examples +# # shell: bash + + name: R-ubuntu on: - push: - branches: - - main - pull_request: - branches: - - main + push: + branches: + - main + pull_request: + branches: + - main jobs: - R-CMD-check: - runs-on: ubuntu-20.04 - strategy: - matrix: - r-version: ['4.3.2', '4.2.0', '4.2.1'] - - env: - R_LIBS_USER: /home/runner/work/_temp/Library - TZ: UTC - _R_CHECK_SYSTEM_CLOCK_: FALSE - NOT_CRAN: true - RSPM: https://packagemanager.posit.co/cran/__linux__/focal/latest - RENV_CONFIG_REPOS_OVERRIDE: https://packagemanager.posit.co/cran/__linux__/focal/latest - - steps: - - uses: actions/checkout@v3 - - - name: Cache R dependencies - uses: actions/cache@v2 - with: - path: ~/R/x86_64-pc-linux-gnu-library/ - key: ${{ runner.os }}-r-${{ matrix.r-version }}-${{ hashFiles('**/renv.lock') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.r-version }}- - - - name: Setup R - uses: r-lib/actions/setup-r@v2 - with: - use-public-rspm: true - r-version: ${{ matrix.r-version }} - - - name: Restore R environment - run: | - if (!requireNamespace('renv', quietly = TRUE)) install.packages('renv') - renv::restore() - shell: Rscript {0} - - - name: Install additional R packages - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the default GitHub token for authentication - run: | - install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "ggplot2")) - install.packages('remotes') - remotes::install_github("davidycliao/flaiR", auth_token = Sys.getenv("GITHUB_TOKEN"), force = TRUE) - shell: Rscript {0} - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.10.x' - - - name: Install Python virtualenv - run: pip install virtualenv - - - name: Create Python virtual environment - run: virtualenv flair_env - - - name: Install Python dependencies in virtual environment - run: | - source flair_env/bin/activate - pip install --upgrade pip - pip install scipy==1.12.0 - 
pip install flair - pip install gensim - - - name: Remove Python cache files - run: find . -name '*.pyc' -delete - - - name: Check R environment status - run: renv::status() - shell: Rscript {0} - - # - name: Check R package (with virtual environment) - # run: | - # source flair_env/bin/activate - # R CMD build . --no-build-vignettes - # R CMD check *tar.gz --no-build-vignettes --no-manual --no-tests --no-examples - # shell: bash + R-CMD-check: + runs-on: ubuntu-20.04 + strategy: + matrix: + r-version: ['4.3.2', '4.2.0', '4.2.1'] + fail-fast: false + + env: + R_LIBS_USER: /home/runner/work/_temp/Library + TZ: UTC + R_CHECK_SYSTEM_CLOCK: FALSE + NOT_CRAN: true + RSPM: https://packagemanager.posit.co/cran/__linux__/focal/latest + RENV_CONFIG_REPOS_OVERRIDE: https://packagemanager.posit.co/cran/__linux__/focal/latest + + steps: + - uses: actions/checkout@v3 + + - name: Cache R dependencies + uses: actions/cache@v3 + with: + path: ~/R/x86_64-pc-linux-gnu-library/ + key: ${{ runner.os }}-r-${{ matrix.r-version }}-${{ hashFiles('**/renv.lock') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.r-version }}- + + - name: Setup R + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + r-version: ${{ matrix.r-version }} + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libcurl4-openssl-dev libssl-dev libxml2-dev libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev + + - name: Install R Matrix package + run: | + install.packages('remotes') + if (getRversion() >= "4.3.0") { + remotes::install_version("Matrix", version = "1.5-1") + } else { + remotes::install_version("Matrix", version = "1.4-1") + } + shell: Rscript {0} + + - name: Restore R environment + run: | + if (!requireNamespace('renv', quietly = TRUE)) install.packages('renv') + renv::restore() + shell: Rscript {0} + + - name: Install additional R packages + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | 
+ install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "ggplot2")) + tryCatch({ + remotes::install_github("davidycliao/flaiR", + auth_token = Sys.getenv("GITHUB_TOKEN"), + force = TRUE, + dependencies = TRUE) + }, error = function(e) { + message("Error installing flaiR: ", e$message) + quit(status = 1) + }) + shell: Rscript {0} + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10.x' + cache: 'pip' + + - name: Install Python virtualenv + run: pip install virtualenv + + - name: Create Python virtual environment + run: virtualenv flair_env + + - name: Install Python dependencies in virtual environment + run: | + source flair_env/bin/activate + pip install --upgrade pip + pip install scipy==1.12.0 + pip install flair + pip install gensim + deactivate + + - name: Remove Python cache files + run: find . -name '*.pyc' -delete + + - name: Check R environment status + run: renv::status() + shell: Rscript {0} + + - name: Check R package (with virtual environment) + run: | + source flair_env/bin/activate + set -e + R CMD build . --no-build-vignettes + R CMD check *tar.gz --no-build-vignettes --no-manual --no-tests --no-examples || true + shell: bash From bc32feec2d0f3509905fa01c32a6d6f69227bd0d Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 15:39:26 +0000 Subject: [PATCH 22/46] Update r_macos.yml --- .github/workflows/r_macos.yml | 138 +++++++++++++++++++++++++++++++--- 1 file changed, 128 insertions(+), 10 deletions(-) diff --git a/.github/workflows/r_macos.yml b/.github/workflows/r_macos.yml index f88214c9..4c34a6c9 100644 --- a/.github/workflows/r_macos.yml +++ b/.github/workflows/r_macos.yml @@ -85,6 +85,101 @@ # source flair_env/bin/activate # R CMD build --no-build-vignettes . 
# shell: bash +# name: R-MacOS +# +# on: +# push: +# branches: [ "main" ] +# pull_request: +# branches: [ "main" ] +# +# permissions: +# contents: read +# +# jobs: +# build: +# runs-on: macos-latest +# strategy: +# matrix: +# r-version: ['4.4.0', '4.3.2'] +# fail-fast: false +# +# steps: +# - uses: actions/checkout@v3 +# +# - name: Update Homebrew +# run: brew update +# +# - name: Install pandoc +# run: | +# for i in {1..3} +# do +# brew install pandoc && break || sleep 15 +# done +# +# - name: Install gfortran and configure Makevars +# run: | +# brew install gcc +# mkdir -p ~/.R +# touch ~/.R/Makevars +# echo "FC=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars +# echo "F77=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars +# echo "FLIBS=-L$(brew --prefix)/lib/gcc/current -lgfortran -lquadmath -lm" >> ~/.R/Makevars +# echo "LDFLAGS=-L$(brew --prefix)/lib/gcc/current" >> ~/.R/Makevars +# +# - name: Set up R ${{ matrix.r-version }} +# uses: r-lib/actions/setup-r@v2 +# with: +# r-version: ${{ matrix.r-version }} +# +# - name: Install R dependencies +# run: | +# # 基礎包安裝 +# Rscript -e 'install.packages(c("remotes", "rcmdcheck", "reticulate", "renv", "knitr", "rmarkdown", "lsa", "purrr", "testthat"), repos="https://cran.r-project.org")' +# +# # 根據 R 版本有條件地安裝 Matrix +# Rscript -e ' +# if (getRversion() >= "4.4.0") { +# install.packages("Matrix") +# } else { +# remotes::install_version("Matrix", version = "1.5.1", repos = "https://cran.r-project.org") +# } +# ' +# +# # 安裝指定版本的 htmltools +# Rscript -e 'remotes::install_version("htmltools", version = "0.5.8")' +# +# # 最後執行 renv::restore() +# Rscript -e 'renv::restore()' +# +# - name: Set up Python +# uses: actions/setup-python@v2 +# with: +# python-version: '3.10.x' +# +# - name: Install Python virtualenv +# run: pip install virtualenv +# +# - name: Create Python virtual environment +# run: virtualenv flair_env +# +# - name: Install Python dependencies in virtual environment +# run: | +# source flair_env/bin/activate +# pip 
install --upgrade pip +# pip install scipy==1.12.0 +# pip install flair +# +# - name: Remove Python cache files +# run: find . -name '*.pyc' -delete +# +# - name: Check (with virtual environment) +# run: | +# source flair_env/bin/activate +# R CMD build --no-build-vignettes . +# shell: bash + + name: R-MacOS on: @@ -101,7 +196,7 @@ jobs: runs-on: macos-latest strategy: matrix: - r-version: ['4.4.0', '4.3.2'] + r-version: ['4.4.0', '4.2.3'] # 改為 4.2.3 fail-fast: false steps: @@ -134,28 +229,49 @@ jobs: - name: Install R dependencies run: | - # 基礎包安裝 - Rscript -e 'install.packages(c("remotes", "rcmdcheck", "reticulate", "renv", "knitr", "rmarkdown", "lsa", "purrr", "testthat"), repos="https://cran.r-project.org")' + # 首先安裝基礎包 + Rscript -e 'install.packages(c("remotes", "rcmdcheck", "reticulate", "renv"), repos="https://cran.r-project.org")' - # 根據 R 版本有條件地安裝 Matrix + # 根據 R 版本安裝相應的 Matrix 版本 Rscript -e ' if (getRversion() >= "4.4.0") { - install.packages("Matrix") + remotes::install_version("Matrix", version = "1.5-3") + } else if (getRversion() >= "4.2.0" && getRversion() < "4.3.0") { + remotes::install_version("Matrix", version = "1.4-1") } else { - remotes::install_version("Matrix", version = "1.5.1", repos = "https://cran.r-project.org") + remotes::install_version("Matrix", version = "1.5-1") } ' - # 安裝指定版本的 htmltools + # 安裝其他依賴 + Rscript -e 'install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "testthat"), repos="https://cran.r-project.org")' + + # 安裝特定版本的 htmltools Rscript -e 'remotes::install_version("htmltools", version = "0.5.8")' - # 最後執行 renv::restore() - Rscript -e 'renv::restore()' + # renv 相關操作 + Rscript -e ' + tryCatch({ + renv::restore() + }, error = function(e) { + message("Error in renv::restore(): ", e$message) + if (!requireNamespace("Matrix", quietly = TRUE)) { + if (getRversion() >= "4.4.0") { + install.packages("Matrix") + } else if (getRversion() >= "4.2.0" && getRversion() < "4.3.0") { + remotes::install_version("Matrix", version = 
"1.4-1") + } else { + remotes::install_version("Matrix", version = "1.5-1") + } + } + }) + ' - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 # 更新到 v4 with: python-version: '3.10.x' + cache: 'pip' - name: Install Python virtualenv run: pip install virtualenv @@ -169,6 +285,7 @@ jobs: pip install --upgrade pip pip install scipy==1.12.0 pip install flair + deactivate - name: Remove Python cache files run: find . -name '*.pyc' -delete @@ -176,5 +293,6 @@ jobs: - name: Check (with virtual environment) run: | source flair_env/bin/activate + set -e R CMD build --no-build-vignettes . shell: bash From 0e909756655c66e332306c1e62f2f1bde67b9f65 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 15:42:28 +0000 Subject: [PATCH 23/46] Update r_ubuntu.yaml --- .github/workflows/r_ubuntu.yaml | 57 ++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/.github/workflows/r_ubuntu.yaml b/.github/workflows/r_ubuntu.yaml index c1ad187b..c4cd07c1 100644 --- a/.github/workflows/r_ubuntu.yaml +++ b/.github/workflows/r_ubuntu.yaml @@ -215,24 +215,55 @@ jobs: - name: Install R Matrix package run: | install.packages('remotes') - if (getRversion() >= "4.3.0") { - remotes::install_version("Matrix", version = "1.5-1") - } else { - remotes::install_version("Matrix", version = "1.4-1") + tryCatch({ + if (getRversion() >= "4.3.0") { + remotes::install_version("Matrix", version = "1.5-1", repos = "https://cran.r-project.org") + } else if (getRversion() >= "4.2.0" && getRversion() < "4.3.0") { + remotes::install_version("Matrix", version = "1.4-1", repos = "https://cran.r-project.org", force = TRUE) + } else { + remotes::install_version("Matrix", version = "1.3-4", repos = "https://cran.r-project.org") + } + }, error = function(e) { + message("Error installing Matrix: ", e$message) + if (getRversion() < "4.3.0") { + remotes::install_version("Matrix", version = "1.4-1", repos = "https://cran.r-project.org", 
force = TRUE) + } + }) + shell: Rscript {0} + + - name: Verify Matrix Installation + run: | + if (!requireNamespace("Matrix", quietly = TRUE)) { + message("Matrix package not installed properly") + quit(status = 1) } + message("Matrix version: ", packageVersion("Matrix")) + shell: Rscript {0} + + - name: Install base R dependencies + run: | + install.packages(c("remotes", "rcmdcheck", "reticulate", "renv", "knitr", + "rmarkdown", "lsa", "purrr", "testthat", "ggplot2"), + repos = "https://cran.r-project.org") shell: Rscript {0} - name: Restore R environment run: | if (!requireNamespace('renv', quietly = TRUE)) install.packages('renv') - renv::restore() + options(warn = 2) + tryCatch({ + renv::restore() + }, error = function(e) { + message("Error in renv::restore(): ", e$message) + # Manual installation fallback + install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "ggplot2")) + }) shell: Rscript {0} - - name: Install additional R packages + - name: Install flaiR package env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "ggplot2")) tryCatch({ remotes::install_github("davidycliao/flaiR", auth_token = Sys.getenv("GITHUB_TOKEN"), @@ -261,21 +292,25 @@ jobs: source flair_env/bin/activate pip install --upgrade pip pip install scipy==1.12.0 - pip install flair - pip install gensim + pip install flair==0.13.0 + pip install gensim==4.3.2 deactivate - name: Remove Python cache files run: find . -name '*.pyc' -delete - name: Check R environment status - run: renv::status() + run: | + message("R version: ", R.version.string) + message("Matrix version: ", packageVersion("Matrix")) + renv::status() shell: Rscript {0} - - name: Check R package (with virtual environment) + - name: Check R package run: | source flair_env/bin/activate set -e R CMD build . 
--no-build-vignettes R CMD check *tar.gz --no-build-vignettes --no-manual --no-tests --no-examples || true + deactivate shell: bash From b2d4109b8908984594effab54f05235b7d215a5d Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 16:13:36 +0000 Subject: [PATCH 24/46] Update r_ubuntu.yaml --- .github/workflows/r_ubuntu.yaml | 143 ++++++++++++++++++-------------- 1 file changed, 79 insertions(+), 64 deletions(-) diff --git a/.github/workflows/r_ubuntu.yaml b/.github/workflows/r_ubuntu.yaml index c4cd07c1..d8f12a10 100644 --- a/.github/workflows/r_ubuntu.yaml +++ b/.github/workflows/r_ubuntu.yaml @@ -164,7 +164,6 @@ # # R CMD check *tar.gz --no-build-vignettes --no-manual --no-tests --no-examples # # shell: bash - name: R-ubuntu on: @@ -194,6 +193,15 @@ jobs: steps: - uses: actions/checkout@v3 + # 添加磁盤清理步驟 + - name: Free Disk Space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + df -h + - name: Cache R dependencies uses: actions/cache@v3 with: @@ -207,109 +215,116 @@ jobs: use-public-rspm: true r-version: ${{ matrix.r-version }} - - name: Install system dependencies - run: | - sudo apt-get update - sudo apt-get install -y libcurl4-openssl-dev libssl-dev libxml2-dev libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev - + # Install Matrix first with version control - name: Install R Matrix package run: | install.packages('remotes') + version <- getRversion() tryCatch({ - if (getRversion() >= "4.3.0") { + if (version >= "4.3.2") { remotes::install_version("Matrix", version = "1.5-1", repos = "https://cran.r-project.org") - } else if (getRversion() >= "4.2.0" && getRversion() < "4.3.0") { + } else if (version >= "4.2.0" && version < "4.3.0") { remotes::install_version("Matrix", version = "1.4-1", repos = "https://cran.r-project.org", force = TRUE) - } else { + } else if (version >= "4.1.0" && version < "4.2.0") { 
remotes::install_version("Matrix", version = "1.3-4", repos = "https://cran.r-project.org") } + message(sprintf("R version: %s, Matrix version: %s", version, packageVersion("Matrix"))) }, error = function(e) { message("Error installing Matrix: ", e$message) - if (getRversion() < "4.3.0") { - remotes::install_version("Matrix", version = "1.4-1", repos = "https://cran.r-project.org", force = TRUE) + if (version < "4.3.0") { + message("Attempting fallback installation...") + remotes::install_version("Matrix", version = "1.4-1", + repos = "https://cran.r-project.org", + force = TRUE) } }) shell: Rscript {0} + # Verify Matrix installation - name: Verify Matrix Installation run: | - if (!requireNamespace("Matrix", quietly = TRUE)) { - message("Matrix package not installed properly") - quit(status = 1) - } - message("Matrix version: ", packageVersion("Matrix")) - shell: Rscript {0} - - - name: Install base R dependencies - run: | - install.packages(c("remotes", "rcmdcheck", "reticulate", "renv", "knitr", - "rmarkdown", "lsa", "purrr", "testthat", "ggplot2"), - repos = "https://cran.r-project.org") - shell: Rscript {0} - - - name: Restore R environment - run: | - if (!requireNamespace('renv', quietly = TRUE)) install.packages('renv') - options(warn = 2) tryCatch({ - renv::restore() - }, error = function(e) { - message("Error in renv::restore(): ", e$message) - # Manual installation fallback - install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "ggplot2")) - }) - shell: Rscript {0} + if (!requireNamespace("Matrix", quietly = TRUE)) { + message("Matrix package not installed properly") + quit(status = 1) + } + version <- as.character(packageVersion("Matrix")) + r_version <- getRversion() + message("Current configuration:") + message("R version: ", r_version) + message("Matrix version: ", version) - - name: Install flaiR package - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - tryCatch({ - remotes::install_github("davidycliao/flaiR", - auth_token = 
Sys.getenv("GITHUB_TOKEN"), - force = TRUE, - dependencies = TRUE) + # 驗證版本相容性 + if (r_version >= "4.3.2" && version != "1.5-1") { + message("Warning: Unexpected Matrix version for R 4.3.2+") + } else if (r_version >= "4.2.0" && r_version < "4.3.0" && version != "1.4-1") { + message("Warning: Unexpected Matrix version for R 4.2.x") + } else if (r_version >= "4.1.0" && r_version < "4.2.0" && version != "1.3-4") { + message("Warning: Unexpected Matrix version for R 4.1.x") + } }, error = function(e) { - message("Error installing flaiR: ", e$message) + message("Error during Matrix verification: ", e$message) quit(status = 1) }) shell: Rscript {0} + # 優化 Python 虛擬環境設置 - name: Set up Python uses: actions/setup-python@v4 with: python-version: '3.10.x' cache: 'pip' - - name: Install Python virtualenv - run: pip install virtualenv - - - name: Create Python virtual environment - run: virtualenv flair_env - - - name: Install Python dependencies in virtual environment + - name: Create and configure venv run: | + python -m venv flair_env source flair_env/bin/activate - pip install --upgrade pip - pip install scipy==1.12.0 - pip install flair==0.13.0 - pip install gensim==4.3.2 + # 分步安裝以減少內存使用 + pip install --no-cache-dir scipy==1.12.0 + pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu + pip install --no-cache-dir flair + pip install --no-cache-dir gensim==4.3.2 deactivate - - name: Remove Python cache files - run: find . 
-name '*.pyc' -delete + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev \ + libfontconfig1-dev \ + libharfbuzz-dev \ + libfribidi-dev \ + libfreetype6-dev \ + libpng-dev \ + libtiff5-dev \ + libjpeg-dev + + - name: Install base R dependencies + run: | + options(repos = c(CRAN = "https://cloud.r-project.org")) + install.packages(c("remotes", "rcmdcheck", "reticulate", "renv", "knitr", + "rmarkdown", "lsa", "purrr", "testthat", "ggplot2")) + shell: Rscript {0} - - name: Check R environment status + - name: Restore R environment run: | - message("R version: ", R.version.string) - message("Matrix version: ", packageVersion("Matrix")) - renv::status() + if (!requireNamespace('renv', quietly = TRUE)) install.packages('renv') + options(warn = 2) + tryCatch({ + renv::restore() + }, error = function(e) { + message("Error in renv::restore(): ", e$message) + install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "ggplot2")) + }) shell: Rscript {0} - name: Check R package run: | source flair_env/bin/activate - set -e + export TMPDIR="/tmp/R-pkg-tmp" + mkdir -p $TMPDIR R CMD build . 
--no-build-vignettes R CMD check *tar.gz --no-build-vignettes --no-manual --no-tests --no-examples || true deactivate From 363628b492996532abc25da8d0fa077ef6d87edc Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 16:26:31 +0000 Subject: [PATCH 25/46] Update r_ubuntu.yaml --- .github/workflows/r_ubuntu.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/r_ubuntu.yaml b/.github/workflows/r_ubuntu.yaml index d8f12a10..6f1fdecf 100644 --- a/.github/workflows/r_ubuntu.yaml +++ b/.github/workflows/r_ubuntu.yaml @@ -268,17 +268,25 @@ jobs: }) shell: Rscript {0} - # 優化 Python 虛擬環境設置 - name: Set up Python uses: actions/setup-python@v4 with: python-version: '3.10.x' - cache: 'pip' + + - name: Setup pip cache + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- - name: Create and configure venv run: | python -m venv flair_env source flair_env/bin/activate + mkdir -p ~/.cache/pip + pip install --upgrade pip # 分步安裝以減少內存使用 pip install --no-cache-dir scipy==1.12.0 pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu From 9af523f5eb54552e3d3573b401946a910f61d89c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 28 Dec 2024 16:34:50 +0000 Subject: [PATCH 26/46] Bump jinja2 from 3.1.4 to 3.1.5 Bumps [jinja2](https://github.com/pallets/jinja) from 3.1.4 to 3.1.5. - [Release notes](https://github.com/pallets/jinja/releases) - [Changelog](https://github.com/pallets/jinja/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/jinja/compare/3.1.4...3.1.5) --- updated-dependencies: - dependency-name: jinja2 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 48d31411..3e14ca65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ hyperopt==0.2.7 idna==3.10 imageio==2.35.1 Janome==0.5.0 -Jinja2==3.1.4 +Jinja2==3.1.5 jmespath==1.0.1 joblib==1.3.2 kiwisolver==1.4.7 From 7b2c9345b3f26cf1401476f8947848a19e4fb0ed Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 16:37:13 +0000 Subject: [PATCH 27/46] Update r.yml --- .github/workflows/r.yml | 205 +++++++++++++++++++++++++++++++--------- 1 file changed, 162 insertions(+), 43 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 6ef0e956..9c618766 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -71,6 +71,106 @@ # with: # upload-snapshots: true # +# +# on: +# push: +# branches: [main, master] +# pull_request: +# branches: [main, master] +# +# name: R-CMD-check +# +# jobs: +# R-CMD-check: +# runs-on: ${{ matrix.config.os }} +# +# name: ${{ matrix.config.os }} (${{ matrix.config.r }}) +# +# strategy: +# fail-fast: false +# matrix: +# config: +# - {os: macos-latest, r: 'release'} +# - {os: windows-latest, r: 'release'} +# - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} +# - {os: ubuntu-latest, r: 'release'} +# env: +# GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} +# R_KEEP_PKG_SOURCE: yes +# +# steps: +# - uses: actions/checkout@v2 +# +# - uses: r-lib/actions/setup-pandoc@v2 +# +# - name: Setup Python +# uses: actions/setup-python@v2 +# with: +# python-version: '3.9' # Ensure Python 3.x is being used +# +# - name: Check Python Version +# run: | +# python --version +# +# - name: Install Python dependencies +# run: | +# python -m pip install --upgrade pip +# pip install flair +# +# - name: Setup Python (Only on ubuntu-latest) +# if: matrix.config.os == 'ubuntu-latest' +# uses: actions/setup-python@v2 +# with: +# python-version: '3.10.13' 
+# +# - name: Install Python venv and dependencies (Only on ubuntu-latest) +# if: matrix.config.os == 'ubuntu-latest' +# run: | +# sudo apt-get update +# sudo apt-get install -y python3-venv +# python -m venv ~/.venv +# echo "RETICULATE_PYTHON=~/.venv/bin/python" >> $GITHUB_ENV +# source ~/.venv/bin/activate +# +# - uses: r-lib/actions/setup-r@v2 +# with: +# r-version: ${{ matrix.config.r }} +# http-user-agent: ${{ matrix.config.http-user-agent }} +# use-public-rspm: true +# +# - name: Install reticulate (Only on ubuntu-latest) +# if: matrix.config.os == 'ubuntu-latest' +# run: | +# Rscript -e "install.packages('reticulate', repos = 'https://cloud.r-project.org/')" +# +# - name: Install Pandoc (Only on Windows) +# if: matrix.config.os == 'windows-latest' +# run: | +# choco install pandoc +# +# - name: Install Python dependencies (Only on Windows) +# if: matrix.config.os == 'windows-latest' +# run: | +# python -m pip install --upgrade pip +# pip install scipy==1.12.0 # test +# pip install flair +# +# - name: Install Python dependencies (Only on macOS) +# if: matrix.config.os == 'macos-latest' +# run: | +# python -m pip install --upgrade pip +# pip install scipy==1.12.0 # test +# pip install flair +# +# - uses: r-lib/actions/setup-r-dependencies@v2 +# with: +# extra-packages: rcmdcheck +# +# # - uses: r-lib/actions/check-r-package@v2 +# # with: +# # upload-snapshots: true + +name: R-CMD-check on: push: @@ -78,22 +178,20 @@ on: pull_request: branches: [main, master] -name: R-CMD-check - jobs: R-CMD-check: runs-on: ${{ matrix.config.os }} - name: ${{ matrix.config.os }} (${{ matrix.config.r }}) strategy: fail-fast: false matrix: config: - - {os: macos-latest, r: 'release'} + - {os: macos-latest, r: 'release'} - {os: windows-latest, r: 'release'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + env: GITHUB_PAT: ${{ 
secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes @@ -103,31 +201,50 @@ jobs: - uses: r-lib/actions/setup-pandoc@v2 + # Install system dependencies for Ubuntu + - name: Install system dependencies (Ubuntu) + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -y \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev \ + libfontconfig1-dev \ + libharfbuzz-dev \ + libfribidi-dev \ + libfreetype6-dev \ + libpng-dev \ + libtiff5-dev \ + libjpeg-dev \ + python3-venv \ + libgit2-dev \ + libssh2-1-dev + - name: Setup Python uses: actions/setup-python@v2 with: - python-version: '3.9' # Ensure Python 3.x is being used + python-version: '3.10.13' - - name: Check Python Version - run: | - python --version + # Setup pip cache + - name: Setup pip cache + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- - name: Install Python dependencies run: | python -m pip install --upgrade pip + pip install scipy==1.12.0 pip install flair + python --version - - name: Setup Python (Only on ubuntu-latest) - if: matrix.config.os == 'ubuntu-latest' - uses: actions/setup-python@v2 - with: - python-version: '3.10.13' - - - name: Install Python venv and dependencies (Only on ubuntu-latest) - if: matrix.config.os == 'ubuntu-latest' + - name: Setup Python venv (Ubuntu) + if: runner.os == 'Linux' run: | - sudo apt-get update - sudo apt-get install -y python3-venv python -m venv ~/.venv echo "RETICULATE_PYTHON=~/.venv/bin/python" >> $GITHUB_ENV source ~/.venv/bin/activate @@ -138,34 +255,36 @@ jobs: http-user-agent: ${{ matrix.config.http-user-agent }} use-public-rspm: true - - name: Install reticulate (Only on ubuntu-latest) - if: matrix.config.os == 'ubuntu-latest' - run: | - Rscript -e "install.packages('reticulate', repos = 'https://cloud.r-project.org/')" - - - name: Install Pandoc (Only on Windows) - if: matrix.config.os == 'windows-latest' - run: | - choco 
install pandoc - - - name: Install Python dependencies (Only on Windows) - if: matrix.config.os == 'windows-latest' + - name: Install Matrix package run: | - python -m pip install --upgrade pip - pip install scipy==1.12.0 # test - pip install flair + install.packages('remotes') + version <- getRversion() + tryCatch({ + if (version >= "4.3.2") { + remotes::install_version("Matrix", version = "1.5-1") + } else if (version >= "4.2.0" && version < "4.3.0") { + remotes::install_version("Matrix", version = "1.4-1", force = TRUE) + } else { + remotes::install_version("Matrix", version = "1.3-4") + } + }, error = function(e) { + message("Error installing Matrix: ", e$message) + }) + shell: Rscript {0} - - name: Install Python dependencies (Only on macOS) - if: matrix.config.os == 'macos-latest' + - name: Install R dependencies run: | - python -m pip install --upgrade pip - pip install scipy==1.12.0 # test - pip install flair + install.packages(c('reticulate', 'png', 'jsonlite')) + shell: Rscript {0} - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: rcmdcheck - # - uses: r-lib/actions/check-r-package@v2 - # with: - # upload-snapshots: true + - name: Check R package + env: + _R_CHECK_CRAN_INCOMING_: false + run: | + options(crayon.enabled = TRUE) + rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") + shell: Rscript {0} From a15736e72e66a3c6405bc5fd4402b2f1289f4d0d Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 16:41:18 +0000 Subject: [PATCH 28/46] Update r.yml --- .github/workflows/r.yml | 233 +++++++++++++++++++++++----------------- 1 file changed, 134 insertions(+), 99 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 9c618766..a877b00a 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -173,118 +173,153 @@ name: R-CMD-check on: - push: - branches: [main, master] - pull_request: - branches: [main, master] + push: + branches: [main, 
master] + pull_request: + branches: [main, master] jobs: - R-CMD-check: - runs-on: ${{ matrix.config.os }} - name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + R-CMD-check: + runs-on: ${{ matrix.config.os }} + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) - strategy: - fail-fast: false - matrix: - config: - - {os: macos-latest, r: 'release'} - - {os: windows-latest, r: 'release'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - - {os: ubuntu-latest, r: 'release'} + strategy: + fail-fast: false + matrix: + config: + - {os: macos-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - R_KEEP_PKG_SOURCE: yes + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes - steps: - - uses: actions/checkout@v2 + steps: + - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-pandoc@v2 + - uses: r-lib/actions/setup-pandoc@v2 - # Install system dependencies for Ubuntu - - name: Install system dependencies (Ubuntu) - if: runner.os == 'Linux' - run: | - sudo apt-get update - sudo apt-get install -y \ - libcurl4-openssl-dev \ - libssl-dev \ - libxml2-dev \ - libfontconfig1-dev \ - libharfbuzz-dev \ - libfribidi-dev \ - libfreetype6-dev \ - libpng-dev \ - libtiff5-dev \ - libjpeg-dev \ - python3-venv \ - libgit2-dev \ - libssh2-1-dev + # Install system dependencies for Ubuntu + - name: Install system dependencies (Ubuntu) + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -y \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev \ + libfontconfig1-dev \ + libharfbuzz-dev \ + libfribidi-dev \ + libfreetype6-dev \ + libpng-dev \ + libtiff5-dev \ + libjpeg-dev \ + python3-venv \ + libgit2-dev \ + libssh2-1-dev \ + pkg-config - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: '3.10.13' + # 驗證 libpng 安裝 + - name: Verify 
libpng installation + if: runner.os == 'Linux' + run: | + pkg-config --libs libpng + ls -l /usr/include/png.h + ls -l /usr/lib/x86_64-linux-gnu/libpng* - # Setup pip cache - - name: Setup pip cache - uses: actions/cache@v3 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} - restore-keys: | - ${{ runner.os }}-pip- + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: '3.10.13' - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install scipy==1.12.0 - pip install flair - python --version + # Setup pip cache + - name: Setup pip cache + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- - - name: Setup Python venv (Ubuntu) - if: runner.os == 'Linux' - run: | - python -m venv ~/.venv - echo "RETICULATE_PYTHON=~/.venv/bin/python" >> $GITHUB_ENV - source ~/.venv/bin/activate + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install scipy==1.12.0 + pip install flair + python --version - - uses: r-lib/actions/setup-r@v2 - with: - r-version: ${{ matrix.config.r }} - http-user-agent: ${{ matrix.config.http-user-agent }} - use-public-rspm: true + - name: Setup Python venv (Ubuntu) + if: runner.os == 'Linux' + run: | + python -m venv ~/.venv + echo "RETICULATE_PYTHON=~/.venv/bin/python" >> $GITHUB_ENV + source ~/.venv/bin/activate - - name: Install Matrix package - run: | - install.packages('remotes') - version <- getRversion() - tryCatch({ - if (version >= "4.3.2") { - remotes::install_version("Matrix", version = "1.5-1") - } else if (version >= "4.2.0" && version < "4.3.0") { - remotes::install_version("Matrix", version = "1.4-1", force = TRUE) - } else { - remotes::install_version("Matrix", version = "1.3-4") - } - }, error = function(e) { - message("Error installing Matrix: ", e$message) - }) - shell: Rscript {0} 
+ - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true - - name: Install R dependencies - run: | - install.packages(c('reticulate', 'png', 'jsonlite')) - shell: Rscript {0} + - name: Install Matrix package + run: | + install.packages('remotes') + version <- getRversion() + tryCatch({ + if (version >= "4.3.2") { + remotes::install_version("Matrix", version = "1.5-1") + } else if (version >= "4.2.0" && version < "4.3.0") { + remotes::install_version("Matrix", version = "1.4-1", force = TRUE) + } else { + remotes::install_version("Matrix", version = "1.3-4") + } + message(sprintf("Matrix version installed: %s", packageVersion("Matrix"))) + }, error = function(e) { + message("Error installing Matrix: ", e$message) + quit(status = 1) + }) + shell: Rscript {0} - - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: rcmdcheck + - name: Install R packages with detailed logging + run: | + options(warn = 2) + tryCatch({ + # 首先嘗試安裝 png + install.packages('png', + type = "source", + repos = "https://cloud.r-project.org/", + configure.args = "--with-libpng-prefix=/usr") - - name: Check R package - env: - _R_CHECK_CRAN_INCOMING_: false - run: | - options(crayon.enabled = TRUE) - rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") - shell: Rscript {0} + # 然後安裝其他包 + install.packages(c('reticulate', 'jsonlite', 'rmarkdown', 'knitr')) + }, error = function(e) { + message("Error during package installation: ", e$message) + system("pkg-config --libs libpng") + system("ls -l /usr/include/png.h") + quit(status = 1) + }) + shell: Rscript {0} + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: rcmdcheck + + - name: Session info + run: | + options(width = 100) + pkgs <- installed.packages()[, "Package"] + sessioninfo::session_info(pkgs, include_base = TRUE) + shell: Rscript {0} + + - name: Check R 
package + env: + _R_CHECK_CRAN_INCOMING_: false + run: | + options(crayon.enabled = TRUE) + rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), + error_on = "warning", + check_dir = "check") + shell: Rscript {0} From ac4fc0194a045a0d898080772ab15f60c2ef0e3e Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 16:48:43 +0000 Subject: [PATCH 29/46] Update r.yml --- .github/workflows/r.yml | 49 ++--------------------------------------- 1 file changed, 2 insertions(+), 47 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index a877b00a..05e78f0e 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -189,8 +189,8 @@ jobs: config: - {os: macos-latest, r: 'release'} - {os: windows-latest, r: 'release'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - - {os: ubuntu-latest, r: 'release'} + #- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + #- {os: ubuntu-latest, r: 'release'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} @@ -201,35 +201,6 @@ jobs: - uses: r-lib/actions/setup-pandoc@v2 - # Install system dependencies for Ubuntu - - name: Install system dependencies (Ubuntu) - if: runner.os == 'Linux' - run: | - sudo apt-get update - sudo apt-get install -y \ - libcurl4-openssl-dev \ - libssl-dev \ - libxml2-dev \ - libfontconfig1-dev \ - libharfbuzz-dev \ - libfribidi-dev \ - libfreetype6-dev \ - libpng-dev \ - libtiff5-dev \ - libjpeg-dev \ - python3-venv \ - libgit2-dev \ - libssh2-1-dev \ - pkg-config - - # 驗證 libpng 安裝 - - name: Verify libpng installation - if: runner.os == 'Linux' - run: | - pkg-config --libs libpng - ls -l /usr/include/png.h - ls -l /usr/lib/x86_64-linux-gnu/libpng* - - name: Setup Python uses: actions/setup-python@v2 with: @@ -251,13 +222,6 @@ jobs: pip install flair python --version - - name: Setup Python venv (Ubuntu) - if: runner.os == 'Linux' - run: | - python -m venv ~/.venv - echo "RETICULATE_PYTHON=~/.venv/bin/python" >> $GITHUB_ENV - source 
~/.venv/bin/activate - - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} @@ -287,18 +251,9 @@ jobs: run: | options(warn = 2) tryCatch({ - # 首先嘗試安裝 png - install.packages('png', - type = "source", - repos = "https://cloud.r-project.org/", - configure.args = "--with-libpng-prefix=/usr") - - # 然後安裝其他包 install.packages(c('reticulate', 'jsonlite', 'rmarkdown', 'knitr')) }, error = function(e) { message("Error during package installation: ", e$message) - system("pkg-config --libs libpng") - system("ls -l /usr/include/png.h") quit(status = 1) }) shell: Rscript {0} From 0e2238b5c66d5400a32ac4ef371c609bdf5f0d5f Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 16:54:48 +0000 Subject: [PATCH 30/46] Delete r2.yml --- .github/workflows/r2.yml | 48 ---------------------------------------- 1 file changed, 48 deletions(-) delete mode 100644 .github/workflows/r2.yml diff --git a/.github/workflows/r2.yml b/.github/workflows/r2.yml deleted file mode 100644 index 6509636d..00000000 --- a/.github/workflows/r2.yml +++ /dev/null @@ -1,48 +0,0 @@ -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. -# -# See https://github.com/r-lib/actions/tree/master/examples#readme for -# additional example workflows available for the R community. 
- -name: R - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -permissions: - contents: read - -jobs: - build: - runs-on: macos-latest - strategy: - matrix: - r-version: ['4.0.0', '4.1.0', '4.1.1'] - steps: - - uses: actions/checkout@v3 - - - name: Install pandoc - run: | - brew install pandoc - - - name: Set up R ${{ matrix.r-version }} - uses: r-lib/actions/setup-r@v1 - with: - r-version: ${{ matrix.r-version }} - - - name: Install dependencies - run: | - install.packages(c("remotes", "rcmdcheck")) - remotes::install_deps(dependencies = TRUE) - shell: Rscript {0} - - - name: Check - run: | - R CMD build --no-build-vignettes . - shell: bash - From ee25717081e078d185179505dee700df10f44ea5 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 17:02:05 +0000 Subject: [PATCH 31/46] Update r.yml --- .github/workflows/r.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 05e78f0e..0dda42ac 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -170,6 +170,7 @@ # # with: # # upload-snapshots: true + name: R-CMD-check on: @@ -189,8 +190,7 @@ jobs: config: - {os: macos-latest, r: 'release'} - {os: windows-latest, r: 'release'} - #- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - #- {os: ubuntu-latest, r: 'release'} + env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} From d42e9925d79a6ce49b4d75515e421c9e272f0274 Mon Sep 17 00:00:00 2001 From: David Liao <35374714+davidycliao@users.noreply.github.com> Date: Sat, 28 Dec 2024 17:03:59 +0000 Subject: [PATCH 32/46] Update r.yml --- .github/workflows/r.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 6ef0e956..3b064018 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -92,8 +92,7 @@ jobs: config: - {os: macos-latest, r: 'release'} - {os: windows-latest, r: 'release'} - - {os: ubuntu-latest, r: 
'devel', http-user-agent: 'release'} - - {os: ubuntu-latest, r: 'release'} + env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes From 700e8c7edab18feeb63f46221198afb63c6abb49 Mon Sep 17 00:00:00 2001 From: David Liao <35374714+davidycliao@users.noreply.github.com> Date: Sat, 28 Dec 2024 17:33:14 +0000 Subject: [PATCH 33/46] Delete .github/workflows/r2.yml --- .github/workflows/r2.yml | 48 ---------------------------------------- 1 file changed, 48 deletions(-) delete mode 100644 .github/workflows/r2.yml diff --git a/.github/workflows/r2.yml b/.github/workflows/r2.yml deleted file mode 100644 index 6509636d..00000000 --- a/.github/workflows/r2.yml +++ /dev/null @@ -1,48 +0,0 @@ -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. -# -# See https://github.com/r-lib/actions/tree/master/examples#readme for -# additional example workflows available for the R community. - -name: R - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -permissions: - contents: read - -jobs: - build: - runs-on: macos-latest - strategy: - matrix: - r-version: ['4.0.0', '4.1.0', '4.1.1'] - steps: - - uses: actions/checkout@v3 - - - name: Install pandoc - run: | - brew install pandoc - - - name: Set up R ${{ matrix.r-version }} - uses: r-lib/actions/setup-r@v1 - with: - r-version: ${{ matrix.r-version }} - - - name: Install dependencies - run: | - install.packages(c("remotes", "rcmdcheck")) - remotes::install_deps(dependencies = TRUE) - shell: Rscript {0} - - - name: Check - run: | - R CMD build --no-build-vignettes . 
- shell: bash - From 263cbc4dc87d591f0f33c1307466e853432d2769 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sat, 28 Dec 2024 22:42:11 +0000 Subject: [PATCH 34/46] UPDATE 0.0.7 tutorial --- _pkgdown.yml | 10 +++--- vignettes/tutorial.Rmd | 72 ++++++++++++++++++++++++++++++------------ 2 files changed, 57 insertions(+), 25 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index d3923e99..eec0482c 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -277,12 +277,12 @@ navbar: - icon: fa-rocket text: Quick Start menu: + - text: "flaiR Installation" + href: articles/quickstart.html#flair-installation - text: "NLP Tasks" href: articles/quickstart.html#nlp-tasks - - text: "Class and Ojbect" - href: articles/quickstart.html#class-and-ojbect - - text: "More Details about Installation" - href: articles/quickstart.html#more-details-about-installation + - text: "Training and Finetuning" + href: articles/quickstart.html#training-and-fine-tuning - icon: fa-project-diagram @@ -348,6 +348,8 @@ navbar: - icon: fa-newspaper-o text: News menu: + - text: "0.0.7" + href: news/index.html#flair-007-2024-12-26 - text: "0.0.6" href: news/index.html#flair-006-2023-10-29 - text: "0.0.5" diff --git a/vignettes/tutorial.Rmd b/vignettes/tutorial.Rmd index 6ae6f6c5..f7ef0591 100644 --- a/vignettes/tutorial.Rmd +++ b/vignettes/tutorial.Rmd @@ -29,17 +29,20 @@ library(reticulate) reticulate::py_install("flair") ``` -# The Overview + +# Flair NLP and flaiR for Social Science
-**Flair NLP** is an open-source library for Natural Language Processing (NLP) developed by [Zalando Research](https://github.com/zalandoresearch/). Known for its state-of-the-art solutions, such as contextual string embeddings for NLP tasks like Named Entity Recognition (NER), Part-of-Speech tagging (POS), and more, it has garnered the attention of the NLP community for its ease of use and powerful functionalities. +Flair NLP is an open-source Natural Language Processing (NLP) library developed by [Zalando Research](https://github.com/zalandoresearch/). Known for its state-of-the-art solutions, it excels in contextual string embeddings, Named Entity Recognition (NER), and Part-of-Speech tagging (POS). Flair offers robust text analysis tools through multiple embedding approaches, including Flair contextual string embeddings, transformer-based embeddings from Hugging Face, and traditional models like GloVe and fasttext. Additionally, it provides pre-trained models for various languages and seamless integration with fine-tuned transformers hosted on Hugging Face. -In addition, Flair NLP offers pre-trained models for various languages and tasks, and is compatible with fine-tuned transformers hosted on Hugging Face. +flaiR bridges these powerful NLP features from Python to the R environment, making advanced text analysis accessible for social science researcher by combining Flair's ease of use with R's familiar interface for integration with popular R packages such as [quanteda](https://quanteda.io) and more.
-- [**Sentence and Token Object**](#tutorial.html#sentence-and-token) +# The Overview + +- [**Sentence and Token Object in FlaiR**](#tutorial.html#sentence-and-token) - [**Sequence Taggings**](#tutorial.html#sequence-taggings) @@ -55,13 +58,16 @@ In addition, Flair NLP offers pre-trained models for various languages and tasks - [**Extending conText's Embedding Regression**](#tutorial.html#extending-contexts-embedding-regression) ------------------------------------------------------------------------- +  + +----- + # Sentence and Token Sentence and Token are fundamental classes. -## **Sentence** +## Sentence
@@ -85,7 +91,7 @@ print(sentence) ``` -## **Token** +## Token
@@ -183,7 +189,7 @@ print(sentence) [^1]: Flair is built on PyTorch, which is a library in Python. -## **Corpus** +## Corpus The Corpus object in Flair is a fundamental data structure that represents a dataset containing text samples, usually comprising of a training set, a development set (or validation set), and a test set. It's designed to work smoothly with Flair's models for tasks like named entity recognition, text classification, and more. @@ -288,7 +294,10 @@ In the later sections, there will be more similar processing using the `Corpus`.
------------------------------------------------------------------------- +  + +----- + # Sequence Taggings @@ -551,7 +560,9 @@ head(results, n = 10) ``` ------------------------------------------------------------------------- +  + +----- # Embedding @@ -1022,15 +1033,15 @@ figure ``` ------------------------------------------------------------------------- +  +----- -# Training a Binary Classifier +# Training a Binary Classifier In this section, we'll train a sentiment analysis model that can categorize text as either positive or negative. This case study is adapted from pages 116 to 130 of Tadej Magajna's book, '[Natural Language Processing with Flair](https://www.packtpub.com/product/natural-language-processing-with-flair/9781801072311)'. The process for training text classifiers in Flair mirrors the process followed for sequence labeling models. Specifically, the steps to train text classifiers are: - - Load a tagged corpus and compute the label dictionary map. - Prepare the document embeddings. - Initialize the `TextClassifier` class. @@ -1044,7 +1055,6 @@ Training text classification models requires a set of text documents (typically,
- ```{r} library(flaiR) # load IMDB from flair_datasets module @@ -1082,7 +1092,6 @@ flaiR covers all the different types of document embeddings that we can use. Her
- ```{r} DocumentPoolEmbeddings <- flair_embeddings()$DocumentPoolEmbeddings WordEmbeddings <- flair_embeddings()$WordEmbeddings @@ -1195,7 +1204,10 @@ print(sentence$labels) ``` ------------------------------------------------------------------------- +  + +----- + # Training RNNs @@ -1312,7 +1324,6 @@ library(flaiR) ``` - ## Fine-tuning a Transformers Model **Step 1** Load Necessary Modules from Flair @@ -1499,7 +1510,10 @@ More R tutorial and documentation see [here](https://github.com/davidycliao/flai ------------------------------------------------------------------------- +  + +----- + # Extending conText's Embedding Regression @@ -1972,16 +1986,17 @@ bt_model <- conText(formula = immigration ~ party + gender, While this tutorial doesn't determine a definitive best approach, it's important to understand the key distinctions between word embedding methods. BERT, FastText, Flair Stacked Embeddings, and GloVe can be categorized into two groups: dynamic and static embeddings. -Dynamic embeddings, such as BERT and Flair, adapt their word representations based on context using high-dimensional vector spaces (BERT uses 768 dimensions in its base model). BERT employs self-attention mechanisms and subword tokenization, while Flair uses character-level modeling. Both effectively handle out-of-vocabulary words through these mechanisms. +Dynamic embeddings, particularly BERT and Flair, adapt their word representations based on context using high-dimensional vector spaces (BERT uses 768 dimensions in its base model). BERT employs self-attention mechanisms and subword tokenization, while Flair uses character-level modeling. Both effectively handle out-of-vocabulary words through these mechanisms. -However, it's worth noting that in their case study, where they provide selected words, in our case study we directly extract individual word vectors from BERT and Flair (forward/backward) embeddings using those selected words. 
This approach doesn't truly achieve the intended contextual modeling. A more meaningful approach would be to extract embeddings at the quasi-sentence or paragraph level. Alternatively, pooling the entire document before extracting embeddings could be more valuable. +However, there is a notable difference between their case study and here. While they provide selected words, we directly extract individual word vectors from BERT and Flair (forward/backward) embeddings using the same set of words. This doesn't truly utilize BERT and Flair embeddings' capability of modeling context. A more meaningful approach would be to extract embeddings at the quasi-sentence or paragraph level, or alternatively, to pool the entire document before extracting embeddings. -These context-based approaches differ significantly from GloVe's methodology, which relies on pre-computed global word-word co-occurrence statistics to generate static word vectors. +These context-based approaches stand in stark contrast to GloVe's methodology, which relies on pre-computed global word-word co-occurrence statistics to generate static word vectors. ```{r echo=FALSE, message = TRUE, warning = TRUE, out.width="95%"} + st <- as.data.frame(st_model@normed_coefficients) st["model"] <- "Flair Stacked Embeddings" ft <- as.data.frame(ft_model@normed_coefficients) @@ -2010,5 +2025,20 @@ ggplot(merged_df, aes(x = coefficient, y = normed.estimate)) + facet_wrap(~model, nrow = 2, ncol = 2, scales = "free_y") ``` +  + +----- + +# Cite + +``` +@Manual{, + title = {Flair NLP and flaiR for Social Science}, + author = {Yen-Chieh Liao, Sohini Timbadia and Stefan Müller}, + year = {2024}, + url = {https://davidycliao.github.io/flaiR/articles/tutorial.html} +} +``` + From fd90cf963b64f73085ec675b83feaee5a151e285 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sun, 29 Dec 2024 03:25:36 +0000 Subject: [PATCH 35/46] UPDATE 0.0.7 clean some type. 
--- .github/workflows/r.yml | 165 ------------------------------ .github/workflows/r_macos.yml | 174 -------------------------------- .github/workflows/r_ubuntu.yaml | 165 ------------------------------ R/predict_label.R | 114 +++++++++++---------- man/predict_label.Rd | 44 ++++---- vignettes/tutorial.Rmd | 45 ++++++++- 6 files changed, 122 insertions(+), 585 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 1636412e..a3309209 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -5,171 +5,6 @@ # # See https://github.com/r-lib/actions/tree/master/examples#readme for # additional example workflows available for the R community. -# on: -# push: -# branches: [main, master] -# pull_request: -# branches: [main, master] -# -# name: R-CMD-check -# -# jobs: -# R-CMD-check: -# runs-on: ${{ matrix.config.os }} -# -# name: ${{ matrix.config.os }} (${{ matrix.config.r }}) -# -# strategy: -# fail-fast: false -# matrix: -# config: -# - {os: macos-latest, r: 'release'} -# - {os: windows-latest, r: 'release'} -# - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} -# - {os: ubuntu-latest, r: 'release'} -# env: -# GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} -# R_KEEP_PKG_SOURCE: yes -# -# steps: -# - uses: actions/checkout@v2 -# -# - uses: r-lib/actions/setup-pandoc@v2 -# -# -# - name: Setup Python (Only on ubuntu-latest) -# if: matrix.config.os == 'ubuntu-latest' -# uses: actions/setup-python@v2 -# with: -# python-version: '3.x' -# -# - name: Install Python venv and dependencies (Only on ubuntu-latest) -# if: matrix.config.os == 'ubuntu-latest' -# run: | -# sudo apt-get update -# sudo apt-get install -y python3-venv -# python -m venv ~/.venv -# echo "RETICULATE_PYTHON=~/.venv/bin/python" >> $GITHUB_ENV -# source ~/.venv/bin/activate -# - uses: r-lib/actions/setup-r@v2 -# with: -# r-version: ${{ matrix.config.r }} -# http-user-agent: ${{ matrix.config.http-user-agent }} -# use-public-rspm: true -# -# - name: Install reticulate 
(Only on ubuntu-latest) -# if: matrix.config.os == 'ubuntu-latest' -# run: | -# Rscript -e "install.packages('reticulate', repos = 'https://cloud.r-project.org/')" -# -# - uses: r-lib/actions/setup-r-dependencies@v2 -# with: -# extra-packages: any::rcmdcheck -# needs: check -# -# - uses: r-lib/actions/check-r-package@v2 -# with: -# upload-snapshots: true -# -# -# on: -# push: -# branches: [main, master] -# pull_request: -# branches: [main, master] -# -# name: R-CMD-check -# -# jobs: -# R-CMD-check: -# runs-on: ${{ matrix.config.os }} -# -# name: ${{ matrix.config.os }} (${{ matrix.config.r }}) -# -# strategy: -# fail-fast: false -# matrix: -# config: -# - {os: macos-latest, r: 'release'} -# - {os: windows-latest, r: 'release'} -# - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} -# - {os: ubuntu-latest, r: 'release'} -# env: -# GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} -# R_KEEP_PKG_SOURCE: yes -# -# steps: -# - uses: actions/checkout@v2 -# -# - uses: r-lib/actions/setup-pandoc@v2 -# -# - name: Setup Python -# uses: actions/setup-python@v2 -# with: -# python-version: '3.9' # Ensure Python 3.x is being used -# -# - name: Check Python Version -# run: | -# python --version -# -# - name: Install Python dependencies -# run: | -# python -m pip install --upgrade pip -# pip install flair -# -# - name: Setup Python (Only on ubuntu-latest) -# if: matrix.config.os == 'ubuntu-latest' -# uses: actions/setup-python@v2 -# with: -# python-version: '3.10.13' -# -# - name: Install Python venv and dependencies (Only on ubuntu-latest) -# if: matrix.config.os == 'ubuntu-latest' -# run: | -# sudo apt-get update -# sudo apt-get install -y python3-venv -# python -m venv ~/.venv -# echo "RETICULATE_PYTHON=~/.venv/bin/python" >> $GITHUB_ENV -# source ~/.venv/bin/activate -# -# - uses: r-lib/actions/setup-r@v2 -# with: -# r-version: ${{ matrix.config.r }} -# http-user-agent: ${{ matrix.config.http-user-agent }} -# use-public-rspm: true -# -# - name: Install reticulate (Only on 
ubuntu-latest) -# if: matrix.config.os == 'ubuntu-latest' -# run: | -# Rscript -e "install.packages('reticulate', repos = 'https://cloud.r-project.org/')" -# -# - name: Install Pandoc (Only on Windows) -# if: matrix.config.os == 'windows-latest' -# run: | -# choco install pandoc -# -# - name: Install Python dependencies (Only on Windows) -# if: matrix.config.os == 'windows-latest' -# run: | -# python -m pip install --upgrade pip -# pip install scipy==1.12.0 # test -# pip install flair -# -# - name: Install Python dependencies (Only on macOS) -# if: matrix.config.os == 'macos-latest' -# run: | -# python -m pip install --upgrade pip -# pip install scipy==1.12.0 # test -# pip install flair -# -# - uses: r-lib/actions/setup-r-dependencies@v2 -# with: -# extra-packages: rcmdcheck -# -# # - uses: r-lib/actions/check-r-package@v2 -# # with: -# # upload-snapshots: true - name: R-CMD-check diff --git a/.github/workflows/r_macos.yml b/.github/workflows/r_macos.yml index 4c34a6c9..da8705d5 100644 --- a/.github/workflows/r_macos.yml +++ b/.github/workflows/r_macos.yml @@ -5,180 +5,6 @@ # # See https://github.com/r-lib/actions/tree/master/examples#readme for # additional example workflows available for the R community. 
-# name: R-MacOS -# -# on: -# push: -# branches: [ "main" ] -# pull_request: -# branches: [ "main" ] -# -# permissions: -# contents: read -# -# jobs: -# build: -# runs-on: macos-latest -# -# strategy: -# matrix: -# r-version: ['4.4.0', '4.3.2'] -# -# steps: -# - uses: actions/checkout@v3 -# -# - name: Update Homebrew -# run: | -# brew update -# -# - name: Install pandoc -# run: | -# for i in {1..3}; do -# brew install pandoc && break || sleep 15 -# done -# -# - name: Install gfortran and configure Makevars -# run: | -# brew install gcc -# mkdir -p ~/.R -# touch ~/.R/Makevars -# echo "FC=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars -# echo "F77=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars -# echo "FLIBS=-L$(brew --prefix)/lib/gcc/current -lgfortran -lquadmath -lm" >> ~/.R/Makevars -# echo "LDFLAGS=-L$(brew --prefix)/lib/gcc/current" >> ~/.R/Makevars -# -# - name: Set up R ${{ matrix.r-version }} -# uses: r-lib/actions/setup-r@v2 -# with: -# r-version: ${{ matrix.r-version }} -# -# - name: Install R dependencies -# run: | -# Rscript -e "install.packages(c('remotes', 'rcmdcheck', 'reticulate', 'renv', 'knitr', 'rmarkdown', 'lsa', 'purrr', 'testthat', 'htmltools'), repos='https://cran.r-project.org')" -# Rscript -e "if (getRversion() >= '4.4.0') remotes::install_version('Matrix', version = '1.5.3') else install.packages('Matrix', type = 'binary')" -# Rscript -e "remotes::install_version('htmltools', version = '0.5.8')" -# Rscript -e "renv::restore()" -# -# - name: Set up Python -# uses: actions/setup-python@v2 -# with: -# python-version: '3.10.x' -# -# - name: Install Python virtualenv -# run: pip install virtualenv -# -# - name: Create Python virtual environment -# run: virtualenv flair_env -# -# - name: Install Python dependencies in virtual environment -# run: | -# source flair_env/bin/activate -# pip install --upgrade pip -# pip install scipy==1.12.0 -# pip install flair -# -# - name: Remove Python cache files -# run: find . 
-name '*.pyc' -delete -# -# - name: Check (with virtual environment) -# run: | -# source flair_env/bin/activate -# R CMD build --no-build-vignettes . -# shell: bash -# name: R-MacOS -# -# on: -# push: -# branches: [ "main" ] -# pull_request: -# branches: [ "main" ] -# -# permissions: -# contents: read -# -# jobs: -# build: -# runs-on: macos-latest -# strategy: -# matrix: -# r-version: ['4.4.0', '4.3.2'] -# fail-fast: false -# -# steps: -# - uses: actions/checkout@v3 -# -# - name: Update Homebrew -# run: brew update -# -# - name: Install pandoc -# run: | -# for i in {1..3} -# do -# brew install pandoc && break || sleep 15 -# done -# -# - name: Install gfortran and configure Makevars -# run: | -# brew install gcc -# mkdir -p ~/.R -# touch ~/.R/Makevars -# echo "FC=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars -# echo "F77=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars -# echo "FLIBS=-L$(brew --prefix)/lib/gcc/current -lgfortran -lquadmath -lm" >> ~/.R/Makevars -# echo "LDFLAGS=-L$(brew --prefix)/lib/gcc/current" >> ~/.R/Makevars -# -# - name: Set up R ${{ matrix.r-version }} -# uses: r-lib/actions/setup-r@v2 -# with: -# r-version: ${{ matrix.r-version }} -# -# - name: Install R dependencies -# run: | -# # 基礎包安裝 -# Rscript -e 'install.packages(c("remotes", "rcmdcheck", "reticulate", "renv", "knitr", "rmarkdown", "lsa", "purrr", "testthat"), repos="https://cran.r-project.org")' -# -# # 根據 R 版本有條件地安裝 Matrix -# Rscript -e ' -# if (getRversion() >= "4.4.0") { -# install.packages("Matrix") -# } else { -# remotes::install_version("Matrix", version = "1.5.1", repos = "https://cran.r-project.org") -# } -# ' -# -# # 安裝指定版本的 htmltools -# Rscript -e 'remotes::install_version("htmltools", version = "0.5.8")' -# -# # 最後執行 renv::restore() -# Rscript -e 'renv::restore()' -# -# - name: Set up Python -# uses: actions/setup-python@v2 -# with: -# python-version: '3.10.x' -# -# - name: Install Python virtualenv -# run: pip install virtualenv -# -# - name: Create Python virtual 
environment -# run: virtualenv flair_env -# -# - name: Install Python dependencies in virtual environment -# run: | -# source flair_env/bin/activate -# pip install --upgrade pip -# pip install scipy==1.12.0 -# pip install flair -# -# - name: Remove Python cache files -# run: find . -name '*.pyc' -delete -# -# - name: Check (with virtual environment) -# run: | -# source flair_env/bin/activate -# R CMD build --no-build-vignettes . -# shell: bash - name: R-MacOS diff --git a/.github/workflows/r_ubuntu.yaml b/.github/workflows/r_ubuntu.yaml index 6f1fdecf..4133b3b3 100644 --- a/.github/workflows/r_ubuntu.yaml +++ b/.github/workflows/r_ubuntu.yaml @@ -1,168 +1,3 @@ -# name: R-ubuntu -# -# on: -# push: -# branches: -# - main -# pull_request: -# branches: -# - main -# -# jobs: -# R-CMD-check: -# runs-on: ubuntu-20.04 -# -# strategy: -# matrix: -# r-version: ['4.3.2', '4.2.0', '4.2.1'] -# -# steps: -# - uses: actions/checkout@v3 -# -# - name: Cache R dependencies -# uses: actions/cache@v2 -# with: -# path: ~/R/x86_64-pc-linux-gnu-library/ -# key: ${{ runner.os }}-r-${{ hashFiles('**/renv.lock') }} -# restore-keys: ${{ runner.os }}-r- -# -# - name: Setup R -# uses: r-lib/actions/setup-r@v2 -# with: -# use-public-rspm: true -# -# - name: Restore R environment -# run: | -# Rscript -e "if (!requireNamespace('renv', quietly = TRUE)) install.packages('renv')" -# Rscript -e "renv::restore()" -# -# - name: Install additional R packages -# run: Rscript -e 'install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "ggplot2"))' -# shell: bash -# -# - name: Set up Python -# uses: actions/setup-python@v2 -# with: -# python-version: '3.10.x' -# -# - name: Install Python virtualenv -# run: pip install virtualenv -# -# - name: Create Python virtual environment -# run: virtualenv flair_env -# -# - name: Install Python dependencies in virtual environment -# run: | -# source flair_env/bin/activate -# pip install --upgrade pip -# pip install scipy==1.12.0 # test -# pip install flair -# -# - 
name: Remove Python cache files -# run: find . -name '*.pyc' -delete -# -# - name: Check R environment status -# run: Rscript -e "renv::status()" -# -# - name: Synchronize R environment -# run: Rscript -e "renv::sync()" -# -# - name: Check R package (with virtual environment) -# run: | -# source flair_env/bin/activate -# R CMD build . --no-build-vignettes -# R CMD check *tar.gz --no-build-vignettes --no-manual --no-examples -# shell: bash -# -# # -# name: R-ubuntu -# -# on: -# push: -# branches: -# - main -# pull_request: -# branches: -# - main -# -# jobs: -# R-CMD-check: -# runs-on: ubuntu-20.04 -# strategy: -# matrix: -# r-version: ['4.3.2', '4.2.0', '4.2.1'] -# -# env: -# R_LIBS_USER: /home/runner/work/_temp/Library -# TZ: UTC -# _R_CHECK_SYSTEM_CLOCK_: FALSE -# NOT_CRAN: true -# RSPM: https://packagemanager.posit.co/cran/__linux__/focal/latest -# RENV_CONFIG_REPOS_OVERRIDE: https://packagemanager.posit.co/cran/__linux__/focal/latest -# -# steps: -# - uses: actions/checkout@v3 -# -# - name: Cache R dependencies -# uses: actions/cache@v2 -# with: -# path: ~/R/x86_64-pc-linux-gnu-library/ -# key: ${{ runner.os }}-r-${{ matrix.r-version }}-${{ hashFiles('**/renv.lock') }} -# restore-keys: ${{ runner.os }}-r-${{ matrix.r-version }}- -# -# - name: Setup R -# uses: r-lib/actions/setup-r@v2 -# with: -# use-public-rspm: true -# r-version: ${{ matrix.r-version }} -# -# - name: Restore R environment -# run: | -# if (!requireNamespace('renv', quietly = TRUE)) install.packages('renv') -# renv::restore() -# shell: Rscript {0} -# -# - name: Install additional R packages -# env: -# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the default GitHub token for authentication -# run: | -# install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "ggplot2")) -# install.packages('remotes') -# remotes::install_github("davidycliao/flaiR", auth_token = Sys.getenv("GITHUB_TOKEN"), force = TRUE) -# shell: Rscript {0} -# -# - name: Set up Python -# uses: actions/setup-python@v2 -# with: -# 
python-version: '3.10.x' -# -# - name: Install Python virtualenv -# run: pip install virtualenv -# -# - name: Create Python virtual environment -# run: virtualenv flair_env -# -# - name: Install Python dependencies in virtual environment -# run: | -# source flair_env/bin/activate -# pip install --upgrade pip -# pip install scipy==1.12.0 -# pip install flair -# pip install gensim -# -# - name: Remove Python cache files -# run: find . -name '*.pyc' -delete -# -# - name: Check R environment status -# run: renv::status() -# shell: Rscript {0} -# -# # - name: Check R package (with virtual environment) -# # run: | -# # source flair_env/bin/activate -# # R CMD build . --no-build-vignettes -# # R CMD check *tar.gz --no-build-vignettes --no-manual --no-tests --no-examples -# # shell: bash name: R-ubuntu diff --git a/R/predict_label.R b/R/predict_label.R index a8108ae1..d28fe799 100644 --- a/R/predict_label.R +++ b/R/predict_label.R @@ -1,88 +1,96 @@ #' Predict Text Label Using Flair Classifier #' -#' This function predicts the label of input text using a Flair classifier, -#' with options for confidence thresholding to adjust predictions to NEUTRAL. -#' #' @param text A character string containing the text to be labeled #' @param classifier A Flair TextClassifier object for making predictions -#' @param threshold_score Numeric value between 0 and 1 representing the confidence -#' threshold for label classification. Defaults to 0.5 if not specified -#' @param threshold Logical indicating whether to apply the threshold_score to -#' adjust predictions to NEUTRAL. Defaults to FALSE +#' @param sentence Optional Flair Sentence object. 
If NULL, one will be created from text #' -#' @return A list containing the following elements: +#' @return A list containing: #' \describe{ -#' \item{label}{Character string of the final predicted label (AGAINST/FAVOR/NEUTRAL)} -#' \item{score}{Numeric confidence score from the classifier} -#' \item{token_number}{Integer count of tokens in the input text} -#' \item{threshold_score}{Numeric value of the threshold used} -#' \item{original_label}{Character string of the classifier's original prediction -#' before thresholding} +#' \item{label}{Character string of predicted label} +#' \item{score}{Numeric confidence score from classifier} +#' \item{token_number}{Integer count of tokens in input text} #' } #' #' @examples #' \dontrun{ -#' # Load a pre-trained classifier -#' classifier <- flair$models$TextClassifier$load('stance-classifier') -#' -#' # Predict label without thresholding +#' # Example 1: Using text input +#' classifier <- flair_models()$TextClassifier$load('stance-classifier') #' result1 <- predict_label( #' text = "I strongly support this policy", #' classifier = classifier #' ) #' -#' # Predict with custom threshold +#' # Example 2: Using pre-created and tagged sentence +#' sent <- Sentence("I love Berlin and New York.") +#' tagger <- flair_models()$SequenceTagger$load('pos') +#' tagger$predict(sent) +#' print(sent) # Shows tokens with POS tags +#' #' result2 <- predict_label( -#' text = "I somewhat agree with the proposal", +#' text = NULL, #' classifier = classifier, -#' threshold_score = 0.7, -#' threshold = TRUE +#' sentence = sent #' ) #' } #' -#' @details -#' The function will throw an error if the classifier is NULL or not a -#' Flair TextClassifier. 
-#' #' @import flaiR #' @export -predict_label <- function(text, classifier, threshold_score = NULL, threshold = FALSE) { - # Check if classifier is provided and valid +predict_label <- function(text, classifier, sentence = NULL) { + + # Check if classifier is valid if (is.null(classifier) || !isTRUE(class(classifier)[1] == "flair.models.text_classification_model.TextClassifier")) { stop("Invalid or missing classifier. Please provide a pre-trained Flair TextClassifier model.") } - # Create a sentence object - sentence <- Sentence(text) + # Check if Sentence exists and is correctly loaded + if (!("python.builtin.type" %in% class(Sentence))) { + stop("Sentence class not found or not properly loaded. Please ensure flaiR is properly loaded.") + } - # Use the classifier to predict - classifier$predict(sentence) + # Check if either text or sentence is provided + if (is.null(text) && is.null(sentence)) { + stop("Either text or sentence must be provided") + } - # Get the predicted label and score - predicted_label <- sentence$labels[[1]]$value - score <- sentence$labels[[1]]$score # 移除 as.numeric - token_number <- length(sentence$tokens) + # Create or validate sentence + if (is.null(sentence)) { + tryCatch({ + sentence <- Sentence(text) + }, error = function(e) { + stop("Failed to create Sentence object: ", e$message) + }) + } else { + # Enhanced sentence validation + if (!inherits(sentence, "flair.data.Sentence")) { + stop("Invalid sentence object. 
Must be a Flair Sentence instance.") + } - # Set default threshold_score if NULL - if (is.null(threshold_score)) { - threshold_score <- 0.5 + if (!("tokens" %in% names(sentence)) || length(sentence$tokens) == 0) { + stop("Invalid sentence object: No tokens found.") + } } - # Modify label based on the score threshold and original label - original_label <- predicted_label - if (threshold && score < threshold_score) { - if (predicted_label %in% c("AGAINST", "FAVOR")) { - predicted_label <- "NEUTRAL" - } + # Use the classifier to predict + tryCatch({ + classifier$predict(sentence) + }, error = function(e) { + stop("Prediction failed: ", e$message) + }) + + # Verify prediction results + if (length(sentence$labels) == 0) { + stop("No prediction labels generated") } - # Construct the prediction result - prediction = list(label = predicted_label, - score = score, - token_number = token_number, - threshold_score = threshold_score, - original_label = original_label) + # Get prediction details + predicted_label <- sentence$labels[[1]]$value + score <- sentence$labels[[1]]$score + token_number <- length(sentence$tokens) - # Return the prediction result - return(prediction) + # Return results + return(list( + label = predicted_label, + score = score, + token_number = token_number + )) } diff --git a/man/predict_label.Rd b/man/predict_label.Rd index 76632910..0acd9b57 100644 --- a/man/predict_label.Rd +++ b/man/predict_label.Rd @@ -4,55 +4,45 @@ \alias{predict_label} \title{Predict Text Label Using Flair Classifier} \usage{ -predict_label(text, classifier, threshold_score = NULL, threshold = FALSE) +predict_label(text, classifier, sentence = NULL) } \arguments{ \item{text}{A character string containing the text to be labeled} \item{classifier}{A Flair TextClassifier object for making predictions} -\item{threshold_score}{Numeric value between 0 and 1 representing the confidence -threshold for label classification. 
Defaults to 0.5 if not specified} - -\item{threshold}{Logical indicating whether to apply the threshold_score to -adjust predictions to NEUTRAL. Defaults to FALSE} +\item{sentence}{Optional Flair Sentence object. If NULL, one will be created from text} } \value{ -A list containing the following elements: +A list containing: \describe{ -\item{label}{Character string of the final predicted label (AGAINST/FAVOR/NEUTRAL)} -\item{score}{Numeric confidence score from the classifier} -\item{token_number}{Integer count of tokens in the input text} -\item{threshold_score}{Numeric value of the threshold used} -\item{original_label}{Character string of the classifier's original prediction -before thresholding} +\item{label}{Character string of predicted label} +\item{score}{Numeric confidence score from classifier} +\item{token_number}{Integer count of tokens in input text} } } \description{ -This function predicts the label of input text using a Flair classifier, -with options for confidence thresholding to adjust predictions to NEUTRAL. -} -\details{ -The function will throw an error if the classifier is NULL or not a -Flair TextClassifier. 
+Predict Text Label Using Flair Classifier } \examples{ \dontrun{ -# Load a pre-trained classifier -classifier <- flair$models$TextClassifier$load('stance-classifier') - -# Predict label without thresholding +# Example 1: Using text input +classifier <- flair_models()$TextClassifier$load('stance-classifier') result1 <- predict_label( text = "I strongly support this policy", classifier = classifier ) -# Predict with custom threshold +# Example 2: Using pre-created and tagged sentence +sent <- Sentence("I love Berlin and New York.") +tagger <- flair_models()$SequenceTagger$load('pos') +tagger$predict(sent) +print(sent) # Shows tokens with POS tags + result2 <- predict_label( - text = "I somewhat agree with the proposal", + text = NULL, classifier = classifier, - threshold_score = 0.7, - threshold = TRUE + sentence = sent ) } diff --git a/vignettes/tutorial.Rmd b/vignettes/tutorial.Rmd index f7ef0591..f4bdc6d5 100644 --- a/vignettes/tutorial.Rmd +++ b/vignettes/tutorial.Rmd @@ -1364,7 +1364,6 @@ old_text <- map(cc_muller_old$text, Sentence) old_labels <- as.character(cc_muller_old$class) old_text <- map2(old_text, old_labels, ~ { - .x$add_label("classification", .y) .x }) @@ -1508,7 +1507,51 @@ After fine-tuning for 1 epoch, the model showed improved performance on the same More R tutorial and documentation see [here](https://github.com/davidycliao/flaiR). +## Using Your Own Fine-tuned Model in flaiR +This seciton demonstrates how to utilize your custom fine-tuned model in flaiR for text classification tasks. Let's explore this process step by step. + +__Setting Up Your Environment__ + +First, we need to load the flaiR package and prepare our model: + + +```{r} +library(flaiR) +classifier <- flair_models()$TextClassifier$load('vignettes/inst/new-muller-campaign-communication/best-model.pt') +``` + +It's important to verify your model's compatibility with `$model_card`. 
You can check this by examining the version requirements: + +```{r} +print(classifier$model_card) + +``` + +```{r} +# Check required versions +print(classifier$model_card$transformers_version) # Required transformers version +print(classifier$model_card$flair_version) # Required Flair version +``` + +__Making Predictions__ + +To make predictions, we first need to prepare our text by creating a Sentence object. This is a key component in Flair's architecture that handles text processing: + +```{r} +# Get the Sentence class from flaiR +Sentence <- flair_data()$Sentence + +# Create a Sentence object with your text +sentence <- Sentence("And to boost the housing we need, we will start to build a new generation of garden cities.") + +# Make prediction +classifier$predict(sentence) + +# Access prediction results +prediction <- sentence$labels[[1]]$value # Get predicted label +confidence <- sentence$labels[[1]]$score # Get confidence score +```   From fa119fba041e47b4fb20b8ee0d29a8a49087e80a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 29 Dec 2024 17:19:00 +0000 Subject: [PATCH 36/46] Bump matplotlib from 3.9.2 to 3.10.0 Bumps [matplotlib](https://github.com/matplotlib/matplotlib) from 3.9.2 to 3.10.0. - [Release notes](https://github.com/matplotlib/matplotlib/releases) - [Commits](https://github.com/matplotlib/matplotlib/compare/v3.9.2...v3.10.0) --- updated-dependencies: - dependency-name: matplotlib dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 483256e3..dac32957 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,7 +32,7 @@ lazy_loader==0.4 lime==0.2.0.1 lxml==4.9.3 MarkupSafe==3.0.2 -matplotlib==3.9.2 +matplotlib==3.10.0 more-itertools==10.5.0 mpld3==0.5.10 mpmath==1.3.0 From 50d894f8cca14f7a532f0e73289e0a975723518e Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sun, 29 Dec 2024 18:08:03 +0000 Subject: [PATCH 37/46] UPLOAD docker --- .github/docker/build-push-action.yml | 92 ++++++++++++++++++++++++++++ Dockerfile | 33 ++++++++++ vignettes/tutorial.Rmd | 5 +- 3 files changed, 127 insertions(+), 3 deletions(-) create mode 100644 .github/docker/build-push-action.yml create mode 100644 Dockerfile diff --git a/.github/docker/build-push-action.yml b/.github/docker/build-push-action.yml new file mode 100644 index 00000000..b7d518af --- /dev/null +++ b/.github/docker/build-push-action.yml @@ -0,0 +1,92 @@ +name: flaiR-Installation-Check + +on: + push: + branches: [main, master] +pull_request: + branches: [main, master] + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} +name: ${{ matrix.config.os }} (${{ matrix.config.r }}) +strategy: + fail-fast: false +matrix: + config: + - {os: macos-latest, r: 'release'} +- {os: windows-latest, r: 'release'} +- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} +- {os: ubuntu-latest, r: 'release'} + +env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} +R_KEEP_PKG_SOURCE: yes + +steps: + - uses: actions/checkout@v2 + +- uses: r-lib/actions/setup-r@v2 +with: + r-version: ${{ matrix.config.r }} +http-user-agent: ${{ matrix.config.http-user-agent }} +use-public-rspm: true + +- uses: r-lib/actions/setup-pandoc@v2 + +- name: Setup Python +uses: actions/setup-python@v2 +with: + python-version: '3.9' + +- name: Check Python Version +run: python --version + +- name: Install Python dependencies +run: | + 
python -m pip install --upgrade pip +pip install flair + +- name: Install R dependencies +run: | + install.packages('remotes') +remotes::install_github("davidycliao/flaiR", force = TRUE) +shell: Rscript {0} + +- uses: r-lib/actions/setup-r-dependencies@v2 +with: + extra-packages: rcmdcheck + +- name: Setup Python (Only on ubuntu-latest) +if: matrix.config.os == 'ubuntu-latest' +uses: actions/setup-python@v2 +with: + python-version: '3.10.13' + +- name: Install Python venv and dependencies (Only on ubuntu-latest) +if: matrix.config.os == 'ubuntu-latest' +run: | + sudo apt-get update +sudo apt-get install -y python3-venv +python -m venv ~/.venv +echo "RETICULATE_PYTHON=~/.venv/bin/python" >> $GITHUB_ENV +source ~/.venv/bin/activate +pip install scipy==1.12.0 + +- name: Install Pandoc (Only on Windows) +if: matrix.config.os == 'windows-latest' +run: choco install pandoc + +- name: Install Python dependencies (Only on Windows) +if: matrix.config.os == 'windows-latest' +run: | + python -m pip install --upgrade pip +pip install scipy==1.12.0 +pip install flair + +- name: Install Python dependencies (Only on macOS) +if: matrix.config.os == 'macos-latest' +run: | + python -m pip install --upgrade pip +pip install scipy==1.12.0 +pip install flair diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..05002c51 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +name: Docker Build and Push + +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +jobs: + build-and-push: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push Docker image + uses: docker/build-push-action@v2 + with: + context: . 
# 指定构建上下文为当前目录 + file: ./Dockerfile # 指定 Dockerfile 的路径 + push: true + tags: ghcr.io/yourusername/your-repo:latest diff --git a/vignettes/tutorial.Rmd b/vignettes/tutorial.Rmd index f4bdc6d5..611a9b16 100644 --- a/vignettes/tutorial.Rmd +++ b/vignettes/tutorial.Rmd @@ -1300,7 +1300,6 @@ trainer$train('models/sentiment', max_epochs=5L) ``` - ## To Apply the Trained Model for Prediction ```{r eval=FALSE, include=TRUE} @@ -1511,7 +1510,7 @@ More R tutorial and documentation see [here](https://github.com/davidycliao/flai This seciton demonstrates how to utilize your custom fine-tuned model in flaiR for text classification tasks. Let's explore this process step by step. -__Setting Up Your Environment__ +**Setting Up Your Environment** First, we need to load the flaiR package and prepare our model: @@ -1534,7 +1533,7 @@ print(classifier$model_card$transformers_version) # Required transformers versi print(classifier$model_card$flair_version) # Required Flair version ``` -__Making Predictions__ +**Making Predictions** To make predictions, we first need to prepare our text by creating a Sentence object. 
This is a key component in Flair's architecture that handles text processing: From 4534fa34232611bd128bd3f1f2dfa4f20683fef9 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sun, 29 Dec 2024 18:18:39 +0000 Subject: [PATCH 38/46] UPDATE --- .github/docker/build-push-action.yml | 92 ---------------------- .github/workflows/build-push-action.yml | 100 ++++++++++++++++++++++++ data/.DS_Store | Bin 0 -> 6148 bytes docker/.DS_Store | Bin 0 -> 6148 bytes docker/.dockerignore | 12 +++ Dockerfile => docker/Dockerfile | 0 6 files changed, 112 insertions(+), 92 deletions(-) delete mode 100644 .github/docker/build-push-action.yml create mode 100644 .github/workflows/build-push-action.yml create mode 100644 data/.DS_Store create mode 100644 docker/.DS_Store create mode 100644 docker/.dockerignore rename Dockerfile => docker/Dockerfile (100%) diff --git a/.github/docker/build-push-action.yml b/.github/docker/build-push-action.yml deleted file mode 100644 index b7d518af..00000000 --- a/.github/docker/build-push-action.yml +++ /dev/null @@ -1,92 +0,0 @@ -name: flaiR-Installation-Check - -on: - push: - branches: [main, master] -pull_request: - branches: [main, master] - -jobs: - R-CMD-check: - runs-on: ${{ matrix.config.os }} -name: ${{ matrix.config.os }} (${{ matrix.config.r }}) -strategy: - fail-fast: false -matrix: - config: - - {os: macos-latest, r: 'release'} -- {os: windows-latest, r: 'release'} -- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} -- {os: ubuntu-latest, r: 'release'} - -env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} -R_KEEP_PKG_SOURCE: yes - -steps: - - uses: actions/checkout@v2 - -- uses: r-lib/actions/setup-r@v2 -with: - r-version: ${{ matrix.config.r }} -http-user-agent: ${{ matrix.config.http-user-agent }} -use-public-rspm: true - -- uses: r-lib/actions/setup-pandoc@v2 - -- name: Setup Python -uses: actions/setup-python@v2 -with: - python-version: '3.9' - -- name: Check Python Version -run: python --version - -- name: Install Python dependencies 
-run: | - python -m pip install --upgrade pip -pip install flair - -- name: Install R dependencies -run: | - install.packages('remotes') -remotes::install_github("davidycliao/flaiR", force = TRUE) -shell: Rscript {0} - -- uses: r-lib/actions/setup-r-dependencies@v2 -with: - extra-packages: rcmdcheck - -- name: Setup Python (Only on ubuntu-latest) -if: matrix.config.os == 'ubuntu-latest' -uses: actions/setup-python@v2 -with: - python-version: '3.10.13' - -- name: Install Python venv and dependencies (Only on ubuntu-latest) -if: matrix.config.os == 'ubuntu-latest' -run: | - sudo apt-get update -sudo apt-get install -y python3-venv -python -m venv ~/.venv -echo "RETICULATE_PYTHON=~/.venv/bin/python" >> $GITHUB_ENV -source ~/.venv/bin/activate -pip install scipy==1.12.0 - -- name: Install Pandoc (Only on Windows) -if: matrix.config.os == 'windows-latest' -run: choco install pandoc - -- name: Install Python dependencies (Only on Windows) -if: matrix.config.os == 'windows-latest' -run: | - python -m pip install --upgrade pip -pip install scipy==1.12.0 -pip install flair - -- name: Install Python dependencies (Only on macOS) -if: matrix.config.os == 'macos-latest' -run: | - python -m pip install --upgrade pip -pip install scipy==1.12.0 -pip install flair diff --git a/.github/workflows/build-push-action.yml b/.github/workflows/build-push-action.yml new file mode 100644 index 00000000..96ae85c2 --- /dev/null +++ b/.github/workflows/build-push-action.yml @@ -0,0 +1,100 @@ +name: flaiR-Installation-Check + +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + strategy: + fail-fast: false + matrix: + config: + - {os: macos-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 
+ R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-pandoc@v2 + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: '3.9' + + - name: Check Python Version + run: python --version + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install flair + + - name: Install R dependencies + run: | + install.packages('remotes') + remotes::install_github("davidycliao/flaiR", force = TRUE) + shell: Rscript {0} + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: rcmdcheck + + - name: Setup Python (Only on ubuntu-latest) + if: matrix.config.os == 'ubuntu-latest' + uses: actions/setup-python@v2 + with: + python-version: '3.10.13' + + - name: Install Python venv and dependencies (Only on ubuntu-latest) + if: matrix.config.os == 'ubuntu-latest' + run: | + sudo apt-get update + sudo apt-get install -y python3-venv + python -m venv ~/.venv + echo "RETICULATE_PYTHON=~/.venv/bin/python" >> $GITHUB_ENV + source ~/.venv/bin/activate + pip install scipy==1.12.0 + + - name: Install Pandoc (Only on Windows) + if: matrix.config.os == 'windows-latest' + run: choco install pandoc + + - name: Install Python dependencies (Only on Windows) + if: matrix.config.os == 'windows-latest' + run: | + python -m pip install --upgrade pip + pip install scipy==1.12.0 + pip install flair + + - name: Install Python dependencies (Only on macOS) + if: matrix.config.os == 'macos-latest' + run: | + python -m pip install --upgrade pip + pip install scipy==1.12.0 + pip install flair + + - name: Build and push Docker image + uses: docker/build-push-action@v2 + with: + context: . 
+ file: ./docker/Dockerfile + push: true + tags: ghcr.io/davidycliao/flaiR:latest diff --git a/data/.DS_Store b/data/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Sun, 29 Dec 2024 18:50:05 +0000 Subject: [PATCH 39/46] UPDATE --- .github/workflows/build-push-action.yml | 17 +++++++++++++++++ docker/Dockerfile | 6 +++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-push-action.yml b/.github/workflows/build-push-action.yml index 96ae85c2..6d79bb8f 100644 --- a/.github/workflows/build-push-action.yml +++ b/.github/workflows/build-push-action.yml @@ -91,6 +91,23 @@ jobs: pip install scipy==1.12.0 pip install flair + docker-build-and-push: + runs-on: ubuntu-latest + needs: R-CMD-check # 依賴 R-CMD-check 完成後再執行 + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Build and push Docker image uses: docker/build-push-action@v2 with: diff --git a/docker/Dockerfile b/docker/Dockerfile index 05002c51..6bbda0bf 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -27,7 +27,7 @@ jobs: - name: Build and push Docker image uses: docker/build-push-action@v2 with: - context: . # 指定构建上下文为当前目录 - file: ./Dockerfile # 指定 Dockerfile 的路径 + context: . 
# + file: ./Dockerfile push: true - tags: ghcr.io/yourusername/your-repo:latest + tags: ghcr.io/davidycliao/flaiR:latest From a13d86ef21076a4bc44790a2083126b3018a273f Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sun, 29 Dec 2024 21:36:27 +0000 Subject: [PATCH 40/46] UPDATE --- .github/workflows/build-push-action.yml | 26 +++---------- docker/Dockerfile | 51 ++++++++++++------------- 2 files changed, 29 insertions(+), 48 deletions(-) diff --git a/.github/workflows/build-push-action.yml b/.github/workflows/build-push-action.yml index 6d79bb8f..6588413a 100644 --- a/.github/workflows/build-push-action.yml +++ b/.github/workflows/build-push-action.yml @@ -1,4 +1,4 @@ -name: flaiR-Installation-Check +name: flaiR-Docker on: push: @@ -91,27 +91,11 @@ jobs: pip install scipy==1.12.0 pip install flair - docker-build-and-push: - runs-on: ubuntu-latest - needs: R-CMD-check # 依賴 R-CMD-check 完成後再執行 - steps: - - name: Checkout code - uses: actions/checkout@v2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v1 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and push Docker image + - name: Build and push Docker image (Only on ubuntu-latest) + if: matrix.config.os == 'ubuntu-latest' uses: docker/build-push-action@v2 with: context: . 
- file: ./docker/Dockerfile + file: ./Dockerfile push: true - tags: ghcr.io/davidycliao/flaiR:latest + tags: ghcr.io/yourusername/your-repo:latest diff --git a/docker/Dockerfile b/docker/Dockerfile index 6bbda0bf..ea850c96 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,33 +1,30 @@ -name: Docker Build and Push +FROM r-base:latest -on: - push: - branches: [main, master] - pull_request: - branches: [main, master] +# 安裝系統依賴 +RUN apt-get update && apt-get install -y \ + python3 \ + python3-pip \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev -jobs: - build-and-push: - runs-on: ubuntu-latest +# 安裝 Python Flair NLP +RUN pip3 install flair - steps: - - name: Checkout code - uses: actions/checkout@v2 +# 安裝 R 依賴項 +RUN R -e "install.packages('remotes')" +RUN R -e "install.packages(c('data.table', 'reticulate', 'curl', 'attempt', 'htmltools', 'stringr'))" +RUN R -e "install.packages(c('knitr', 'rmarkdown', 'lsa', 'purrr', 'jsonlite', 'ggplot2', 'plotly', 'testthat'))" - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 +# 複製 R 套件到容器中 +COPY . /usr/src/my_pkg +WORKDIR /usr/src/my_pkg - - name: Login to GitHub Container Registry - uses: docker/login-action@v1 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} +# 安裝 R 套件 +RUN R -e "remotes::install_local(force = TRUE)" - - name: Build and push Docker image - uses: docker/build-push-action@v2 - with: - context: . 
# - file: ./Dockerfile - push: true - tags: ghcr.io/davidycliao/flaiR:latest +# 清理不必要的文件 +RUN rm -rf /usr/src/my_pkg + +# 設定預設命令 +CMD ["R"] From 332e9784550ffe5e50784bb71a7b7c19243255ad Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sun, 29 Dec 2024 21:39:44 +0000 Subject: [PATCH 41/46] Update build-push-action.yml --- .github/workflows/build-push-action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-push-action.yml b/.github/workflows/build-push-action.yml index 6588413a..0a55be24 100644 --- a/.github/workflows/build-push-action.yml +++ b/.github/workflows/build-push-action.yml @@ -98,4 +98,4 @@ jobs: context: . file: ./Dockerfile push: true - tags: ghcr.io/yourusername/your-repo:latest + tags: ghcr.io/davidycliao/flaiR:latest From 15e8bdb59f449bed079d5d240806137273ae297e Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sun, 29 Dec 2024 21:42:58 +0000 Subject: [PATCH 42/46] Update build-push-action.yml --- .github/workflows/build-push-action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-push-action.yml b/.github/workflows/build-push-action.yml index 0a55be24..bb7f167c 100644 --- a/.github/workflows/build-push-action.yml +++ b/.github/workflows/build-push-action.yml @@ -96,6 +96,6 @@ jobs: uses: docker/build-push-action@v2 with: context: . - file: ./Dockerfile + file: ./docker/Dockerfile push: true tags: ghcr.io/davidycliao/flaiR:latest From 5313ae7f1ce4b3b64284d84f35e8be751ec78cd4 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sun, 29 Dec 2024 21:50:58 +0000 Subject: [PATCH 43/46] UPDATE --- .github/workflows/build-push-action.yml | 2 +- docker/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-push-action.yml b/.github/workflows/build-push-action.yml index bb7f167c..e9960e31 100644 --- a/.github/workflows/build-push-action.yml +++ b/.github/workflows/build-push-action.yml @@ -98,4 +98,4 @@ jobs: context: . 
file: ./docker/Dockerfile push: true - tags: ghcr.io/davidycliao/flaiR:latest + tags: ghcr.io/davidycliao/flair:latest diff --git a/docker/Dockerfile b/docker/Dockerfile index ea850c96..29ef7410 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,5 +1,5 @@ FROM r-base:latest - +LABEL maintainer="Yen-Chieh Liao " # 安裝系統依賴 RUN apt-get update && apt-get install -y \ python3 \ From efe637052087067eee23ac2b5d5adb39feebc6d0 Mon Sep 17 00:00:00 2001 From: davidycliao Date: Sun, 29 Dec 2024 22:15:23 +0000 Subject: [PATCH 44/46] UPDATE --- docker/.dockerignore => .dockerignore | 0 docker/Dockerfile => Dockerfile | 0 docker/.DS_Store | Bin 6148 -> 0 bytes 3 files changed, 0 insertions(+), 0 deletions(-) rename docker/.dockerignore => .dockerignore (100%) rename docker/Dockerfile => Dockerfile (100%) delete mode 100644 docker/.DS_Store diff --git a/docker/.dockerignore b/.dockerignore similarity index 100% rename from docker/.dockerignore rename to .dockerignore diff --git a/docker/Dockerfile b/Dockerfile similarity index 100% rename from docker/Dockerfile rename to Dockerfile diff --git a/docker/.DS_Store b/docker/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Sun, 29 Dec 2024 22:29:43 +0000 Subject: [PATCH 45/46] Update build-push-action.yml --- .github/workflows/build-push-action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-push-action.yml b/.github/workflows/build-push-action.yml index e9960e31..4b190f66 100644 --- a/.github/workflows/build-push-action.yml +++ b/.github/workflows/build-push-action.yml @@ -96,6 +96,6 @@ jobs: uses: docker/build-push-action@v2 with: context: . 
- file: ./docker/Dockerfile + file: ./Dockerfile push: true tags: ghcr.io/davidycliao/flair:latest From ca6513de873b0bb1f2429317c338c52a962373aa Mon Sep 17 00:00:00 2001 From: David Liao <35374714+davidycliao@users.noreply.github.com> Date: Sun, 29 Dec 2024 22:30:24 +0000 Subject: [PATCH 46/46] Rename build-push-action.yml to docker-publish.yml --- .github/workflows/{build-push-action.yml => docker-publish.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{build-push-action.yml => docker-publish.yml} (100%) diff --git a/.github/workflows/build-push-action.yml b/.github/workflows/docker-publish.yml similarity index 100% rename from .github/workflows/build-push-action.yml rename to .github/workflows/docker-publish.yml