diff --git a/.Renviron b/.Renviron index 383154df..dfc6d026 100644 --- a/.Renviron +++ b/.Renviron @@ -1 +1 @@ -_R_CHECK_FORCE_SUGGESTS_=false +_R_CHECK_FORCE_SUGGESTS_=TRUE diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..f8b25cfa --- /dev/null +++ b/.dockerignore @@ -0,0 +1,12 @@ +.git/ +.gitignore + +.Rhistory +.RData +.Rproj.user +*.tar.gz +*.Rcheck/ +tests/testthat/_snaps/ +docs/ +.DS_Store +Thumbs.db diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml new file mode 100644 index 00000000..4b190f66 --- /dev/null +++ b/.github/workflows/docker-publish.yml @@ -0,0 +1,101 @@ +name: flaiR-Docker + +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + strategy: + fail-fast: false + matrix: + config: + - {os: macos-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-pandoc@v2 + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: '3.9' + + - name: Check Python Version + run: python --version + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install flair + + - name: Install R dependencies + run: | + install.packages('remotes') + remotes::install_github("davidycliao/flaiR", force = TRUE) + shell: Rscript {0} + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: rcmdcheck + + - name: Setup Python (Only on ubuntu-latest) + if: matrix.config.os == 
'ubuntu-latest' + uses: actions/setup-python@v2 + with: + python-version: '3.10.13' + + - name: Install Python venv and dependencies (Only on ubuntu-latest) + if: matrix.config.os == 'ubuntu-latest' + run: | + sudo apt-get update + sudo apt-get install -y python3-venv + python -m venv ~/.venv + echo "RETICULATE_PYTHON=~/.venv/bin/python" >> $GITHUB_ENV + source ~/.venv/bin/activate + pip install scipy==1.12.0 + + - name: Install Pandoc (Only on Windows) + if: matrix.config.os == 'windows-latest' + run: choco install pandoc + + - name: Install Python dependencies (Only on Windows) + if: matrix.config.os == 'windows-latest' + run: | + python -m pip install --upgrade pip + pip install scipy==1.12.0 + pip install flair + + - name: Install Python dependencies (Only on macOS) + if: matrix.config.os == 'macos-latest' + run: | + python -m pip install --upgrade pip + pip install scipy==1.12.0 + pip install flair + + - name: Build and push Docker image (Only on ubuntu-latest) + if: matrix.config.os == 'ubuntu-latest' + uses: docker/build-push-action@v2 + with: + context: . + file: ./Dockerfile + push: true + tags: ghcr.io/davidycliao/flair:latest diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 6ef0e956..a3309209 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -5,82 +5,17 @@ # # See https://github.com/r-lib/actions/tree/master/examples#readme for # additional example workflows available for the R community. 
-# on: -# push: -# branches: [main, master] -# pull_request: -# branches: [main, master] -# -# name: R-CMD-check -# -# jobs: -# R-CMD-check: -# runs-on: ${{ matrix.config.os }} -# -# name: ${{ matrix.config.os }} (${{ matrix.config.r }}) -# -# strategy: -# fail-fast: false -# matrix: -# config: -# - {os: macos-latest, r: 'release'} -# - {os: windows-latest, r: 'release'} -# - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} -# - {os: ubuntu-latest, r: 'release'} -# env: -# GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} -# R_KEEP_PKG_SOURCE: yes -# -# steps: -# - uses: actions/checkout@v2 -# -# - uses: r-lib/actions/setup-pandoc@v2 -# -# -# - name: Setup Python (Only on ubuntu-latest) -# if: matrix.config.os == 'ubuntu-latest' -# uses: actions/setup-python@v2 -# with: -# python-version: '3.x' -# -# - name: Install Python venv and dependencies (Only on ubuntu-latest) -# if: matrix.config.os == 'ubuntu-latest' -# run: | -# sudo apt-get update -# sudo apt-get install -y python3-venv -# python -m venv ~/.venv -# echo "RETICULATE_PYTHON=~/.venv/bin/python" >> $GITHUB_ENV -# source ~/.venv/bin/activate -# - uses: r-lib/actions/setup-r@v2 -# with: -# r-version: ${{ matrix.config.r }} -# http-user-agent: ${{ matrix.config.http-user-agent }} -# use-public-rspm: true -# -# - name: Install reticulate (Only on ubuntu-latest) -# if: matrix.config.os == 'ubuntu-latest' -# run: | -# Rscript -e "install.packages('reticulate', repos = 'https://cloud.r-project.org/')" -# -# - uses: r-lib/actions/setup-r-dependencies@v2 -# with: -# extra-packages: any::rcmdcheck -# needs: check -# -# - uses: r-lib/actions/check-r-package@v2 -# with: -# upload-snapshots: true -# - -on: - push: - branches: [main, master] - pull_request: - branches: [main, master] name: R-CMD-check +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + jobs: + R-CMD-check: runs-on: ${{ matrix.config.os }} @@ -92,8 +27,7 @@ jobs: config: - {os: macos-latest, r: 'release'} - {os: 
windows-latest, r: 'release'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - - {os: ubuntu-latest, r: 'release'} + env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes @@ -166,6 +100,3 @@ jobs: with: extra-packages: rcmdcheck - # - uses: r-lib/actions/check-r-package@v2 - # with: - # upload-snapshots: true diff --git a/.github/workflows/r2.yml b/.github/workflows/r2.yml deleted file mode 100644 index 6509636d..00000000 --- a/.github/workflows/r2.yml +++ /dev/null @@ -1,48 +0,0 @@ -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. -# -# See https://github.com/r-lib/actions/tree/master/examples#readme for -# additional example workflows available for the R community. - -name: R - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -permissions: - contents: read - -jobs: - build: - runs-on: macos-latest - strategy: - matrix: - r-version: ['4.0.0', '4.1.0', '4.1.1'] - steps: - - uses: actions/checkout@v3 - - - name: Install pandoc - run: | - brew install pandoc - - - name: Set up R ${{ matrix.r-version }} - uses: r-lib/actions/setup-r@v1 - with: - r-version: ${{ matrix.r-version }} - - - name: Install dependencies - run: | - install.packages(c("remotes", "rcmdcheck")) - remotes::install_deps(dependencies = TRUE) - shell: Rscript {0} - - - name: Check - run: | - R CMD build --no-build-vignettes . - shell: bash - diff --git a/.github/workflows/r_macos.yml b/.github/workflows/r_macos.yml index 0f531561..da8705d5 100644 --- a/.github/workflows/r_macos.yml +++ b/.github/workflows/r_macos.yml @@ -5,6 +5,7 @@ # # See https://github.com/r-lib/actions/tree/master/examples#readme for # additional example workflows available for the R community. 
+ name: R-MacOS on: @@ -19,69 +20,105 @@ permissions: jobs: build: runs-on: macos-latest - strategy: matrix: - r-version: ['4.4.0', '4.3.2'] + r-version: ['4.4.0', '4.2.3'] # 改為 4.2.3 + fail-fast: false steps: - - uses: actions/checkout@v3 - - - name: Update Homebrew - run: | - brew update - - - name: Install pandoc - run: | - for i in {1..3}; do - brew install pandoc && break || sleep 15 - done - - - name: Install gfortran and configure Makevars - run: | - brew install gcc - mkdir -p ~/.R - touch ~/.R/Makevars - echo "FC=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars - echo "F77=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars - echo "FLIBS=-L$(brew --prefix)/lib/gcc/current -lgfortran -lquadmath -lm" >> ~/.R/Makevars - echo "LDFLAGS=-L$(brew --prefix)/lib/gcc/current" >> ~/.R/Makevars - - - name: Set up R ${{ matrix.r-version }} - uses: r-lib/actions/setup-r@v2 - with: - r-version: ${{ matrix.r-version }} - - - name: Install R dependencies - run: | - Rscript -e "install.packages(c('remotes', 'rcmdcheck', 'reticulate', 'renv', 'knitr', 'rmarkdown', 'lsa', 'purrr', 'testthat', 'htmltools'), repos='https://cran.r-project.org')" - Rscript -e "if (getRversion() >= '4.4.0') remotes::install_version('Matrix', version = '1.5.3') else install.packages('Matrix', type = 'binary')" - Rscript -e "remotes::install_version('htmltools', version = '0.5.8')" - Rscript -e "renv::restore()" - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.10.x' - - - name: Install Python virtualenv - run: pip install virtualenv - - - name: Create Python virtual environment - run: virtualenv flair_env - - - name: Install Python dependencies in virtual environment - run: | - source flair_env/bin/activate - pip install --upgrade pip - pip install scipy==1.12.0 - pip install flair - - - name: Remove Python cache files - run: find . 
-name '*.pyc' -delete - - - name: Check (with virtual environment) - run: | - source flair_env/bin/activate - R CMD build --no-build-vignettes . - shell: bash + - uses: actions/checkout@v3 + + - name: Update Homebrew + run: brew update + + - name: Install pandoc + run: | + for i in {1..3} + do + brew install pandoc && break || sleep 15 + done + + - name: Install gfortran and configure Makevars + run: | + brew install gcc + mkdir -p ~/.R + touch ~/.R/Makevars + echo "FC=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars + echo "F77=$(brew --prefix)/bin/gfortran" >> ~/.R/Makevars + echo "FLIBS=-L$(brew --prefix)/lib/gcc/current -lgfortran -lquadmath -lm" >> ~/.R/Makevars + echo "LDFLAGS=-L$(brew --prefix)/lib/gcc/current" >> ~/.R/Makevars + + - name: Set up R ${{ matrix.r-version }} + uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.r-version }} + + - name: Install R dependencies + run: | + # 首先安裝基礎包 + Rscript -e 'install.packages(c("remotes", "rcmdcheck", "reticulate", "renv"), repos="https://cran.r-project.org")' + + # 根據 R 版本安裝相應的 Matrix 版本 + Rscript -e ' + if (getRversion() >= "4.4.0") { + remotes::install_version("Matrix", version = "1.5-3") + } else if (getRversion() >= "4.2.0" && getRversion() < "4.3.0") { + remotes::install_version("Matrix", version = "1.4-1") + } else { + remotes::install_version("Matrix", version = "1.5-1") + } + ' + + # 安裝其他依賴 + Rscript -e 'install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "testthat"), repos="https://cran.r-project.org")' + + # 安裝特定版本的 htmltools + Rscript -e 'remotes::install_version("htmltools", version = "0.5.8")' + + # renv 相關操作 + Rscript -e ' + tryCatch({ + renv::restore() + }, error = function(e) { + message("Error in renv::restore(): ", e$message) + if (!requireNamespace("Matrix", quietly = TRUE)) { + if (getRversion() >= "4.4.0") { + install.packages("Matrix") + } else if (getRversion() >= "4.2.0" && getRversion() < "4.3.0") { + remotes::install_version("Matrix", version = "1.4-1") + } else { + 
remotes::install_version("Matrix", version = "1.5-1") + } + } + }) + ' + + - name: Set up Python + uses: actions/setup-python@v4 # 更新到 v4 + with: + python-version: '3.10.x' + cache: 'pip' + + - name: Install Python virtualenv + run: pip install virtualenv + + - name: Create Python virtual environment + run: virtualenv flair_env + + - name: Install Python dependencies in virtual environment + run: | + source flair_env/bin/activate + pip install --upgrade pip + pip install scipy==1.12.0 + pip install flair + deactivate + + - name: Remove Python cache files + run: find . -name '*.pyc' -delete + + - name: Check (with virtual environment) + run: | + source flair_env/bin/activate + set -e + R CMD build --no-build-vignettes . + shell: bash diff --git a/.github/workflows/r_ubuntu.yaml b/.github/workflows/r_ubuntu.yaml index bc18b1d7..4133b3b3 100644 --- a/.github/workflows/r_ubuntu.yaml +++ b/.github/workflows/r_ubuntu.yaml @@ -1,165 +1,174 @@ -# name: R-ubuntu -# -# on: -# push: -# branches: -# - main -# pull_request: -# branches: -# - main -# -# jobs: -# R-CMD-check: -# runs-on: ubuntu-20.04 -# -# strategy: -# matrix: -# r-version: ['4.3.2', '4.2.0', '4.2.1'] -# -# steps: -# - uses: actions/checkout@v3 -# -# - name: Cache R dependencies -# uses: actions/cache@v2 -# with: -# path: ~/R/x86_64-pc-linux-gnu-library/ -# key: ${{ runner.os }}-r-${{ hashFiles('**/renv.lock') }} -# restore-keys: ${{ runner.os }}-r- -# -# - name: Setup R -# uses: r-lib/actions/setup-r@v2 -# with: -# use-public-rspm: true -# -# - name: Restore R environment -# run: | -# Rscript -e "if (!requireNamespace('renv', quietly = TRUE)) install.packages('renv')" -# Rscript -e "renv::restore()" -# -# - name: Install additional R packages -# run: Rscript -e 'install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "ggplot2"))' -# shell: bash -# -# - name: Set up Python -# uses: actions/setup-python@v2 -# with: -# python-version: '3.10.x' -# -# - name: Install Python virtualenv -# run: pip install virtualenv 
-# -# - name: Create Python virtual environment -# run: virtualenv flair_env -# -# - name: Install Python dependencies in virtual environment -# run: | -# source flair_env/bin/activate -# pip install --upgrade pip -# pip install scipy==1.12.0 # test -# pip install flair -# -# - name: Remove Python cache files -# run: find . -name '*.pyc' -delete -# -# - name: Check R environment status -# run: Rscript -e "renv::status()" -# -# - name: Synchronize R environment -# run: Rscript -e "renv::sync()" -# -# - name: Check R package (with virtual environment) -# run: | -# source flair_env/bin/activate -# R CMD build . --no-build-vignettes -# R CMD check *tar.gz --no-build-vignettes --no-manual --no-examples -# shell: bash -# -# + name: R-ubuntu on: - push: - branches: - - main - pull_request: - branches: - - main + push: + branches: + - main + pull_request: + branches: + - main jobs: - R-CMD-check: - runs-on: ubuntu-20.04 - strategy: - matrix: - r-version: ['4.3.2', '4.2.0', '4.2.1'] - - env: - R_LIBS_USER: /home/runner/work/_temp/Library - TZ: UTC - _R_CHECK_SYSTEM_CLOCK_: FALSE - NOT_CRAN: true - RSPM: https://packagemanager.posit.co/cran/__linux__/focal/latest - RENV_CONFIG_REPOS_OVERRIDE: https://packagemanager.posit.co/cran/__linux__/focal/latest - - steps: - - uses: actions/checkout@v3 - - - name: Cache R dependencies - uses: actions/cache@v2 - with: - path: ~/R/x86_64-pc-linux-gnu-library/ - key: ${{ runner.os }}-r-${{ matrix.r-version }}-${{ hashFiles('**/renv.lock') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.r-version }}- - - - name: Setup R - uses: r-lib/actions/setup-r@v2 - with: - use-public-rspm: true - r-version: ${{ matrix.r-version }} - - - name: Restore R environment - run: | - if (!requireNamespace('renv', quietly = TRUE)) install.packages('renv') - renv::restore() - shell: Rscript {0} - - - name: Install additional R packages - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Use the default GitHub token for authentication - run: | - 
install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "ggplot2")) - install.packages('remotes') - remotes::install_github("davidycliao/flaiR", auth_token = Sys.getenv("GITHUB_TOKEN"), force = TRUE) - shell: Rscript {0} - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.10.x' - - - name: Install Python virtualenv - run: pip install virtualenv - - - name: Create Python virtual environment - run: virtualenv flair_env - - - name: Install Python dependencies in virtual environment - run: | - source flair_env/bin/activate - pip install --upgrade pip - pip install scipy==1.12.0 - pip install flair - pip install gensim - - - name: Remove Python cache files - run: find . -name '*.pyc' -delete - - - name: Check R environment status - run: renv::status() - shell: Rscript {0} - - # - name: Check R package (with virtual environment) - # run: | - # source flair_env/bin/activate - # R CMD build . --no-build-vignettes - # R CMD check *tar.gz --no-build-vignettes --no-manual --no-tests --no-examples - # shell: bash + R-CMD-check: + runs-on: ubuntu-20.04 + strategy: + matrix: + r-version: ['4.3.2', '4.2.0', '4.2.1'] + fail-fast: false + + env: + R_LIBS_USER: /home/runner/work/_temp/Library + TZ: UTC + R_CHECK_SYSTEM_CLOCK: FALSE + NOT_CRAN: true + RSPM: https://packagemanager.posit.co/cran/__linux__/focal/latest + RENV_CONFIG_REPOS_OVERRIDE: https://packagemanager.posit.co/cran/__linux__/focal/latest + + steps: + - uses: actions/checkout@v3 + + # 添加磁盤清理步驟 + - name: Free Disk Space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + df -h + + - name: Cache R dependencies + uses: actions/cache@v3 + with: + path: ~/R/x86_64-pc-linux-gnu-library/ + key: ${{ runner.os }}-r-${{ matrix.r-version }}-${{ hashFiles('**/renv.lock') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.r-version }}- + + - name: Setup R + uses: r-lib/actions/setup-r@v2 + with: + 
use-public-rspm: true + r-version: ${{ matrix.r-version }} + + # Install Matrix first with version control + - name: Install R Matrix package + run: | + install.packages('remotes') + version <- getRversion() + tryCatch({ + if (version >= "4.3.2") { + remotes::install_version("Matrix", version = "1.5-1", repos = "https://cran.r-project.org") + } else if (version >= "4.2.0" && version < "4.3.0") { + remotes::install_version("Matrix", version = "1.4-1", repos = "https://cran.r-project.org", force = TRUE) + } else if (version >= "4.1.0" && version < "4.2.0") { + remotes::install_version("Matrix", version = "1.3-4", repos = "https://cran.r-project.org") + } + message(sprintf("R version: %s, Matrix version: %s", version, packageVersion("Matrix"))) + }, error = function(e) { + message("Error installing Matrix: ", e$message) + if (version < "4.3.0") { + message("Attempting fallback installation...") + remotes::install_version("Matrix", version = "1.4-1", + repos = "https://cran.r-project.org", + force = TRUE) + } + }) + shell: Rscript {0} + + # Verify Matrix installation + - name: Verify Matrix Installation + run: | + tryCatch({ + if (!requireNamespace("Matrix", quietly = TRUE)) { + message("Matrix package not installed properly") + quit(status = 1) + } + version <- as.character(packageVersion("Matrix")) + r_version <- getRversion() + message("Current configuration:") + message("R version: ", r_version) + message("Matrix version: ", version) + + # 驗證版本相容性 + if (r_version >= "4.3.2" && version != "1.5-1") { + message("Warning: Unexpected Matrix version for R 4.3.2+") + } else if (r_version >= "4.2.0" && r_version < "4.3.0" && version != "1.4-1") { + message("Warning: Unexpected Matrix version for R 4.2.x") + } else if (r_version >= "4.1.0" && r_version < "4.2.0" && version != "1.3-4") { + message("Warning: Unexpected Matrix version for R 4.1.x") + } + }, error = function(e) { + message("Error during Matrix verification: ", e$message) + quit(status = 1) + }) + shell: Rscript 
{0} + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10.x' + + - name: Setup pip cache + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Create and configure venv + run: | + python -m venv flair_env + source flair_env/bin/activate + mkdir -p ~/.cache/pip + pip install --upgrade pip + # 分步安裝以減少內存使用 + pip install --no-cache-dir scipy==1.12.0 + pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu + pip install --no-cache-dir flair + pip install --no-cache-dir gensim==4.3.2 + deactivate + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev \ + libfontconfig1-dev \ + libharfbuzz-dev \ + libfribidi-dev \ + libfreetype6-dev \ + libpng-dev \ + libtiff5-dev \ + libjpeg-dev + + - name: Install base R dependencies + run: | + options(repos = c(CRAN = "https://cloud.r-project.org")) + install.packages(c("remotes", "rcmdcheck", "reticulate", "renv", "knitr", + "rmarkdown", "lsa", "purrr", "testthat", "ggplot2")) + shell: Rscript {0} + + - name: Restore R environment + run: | + if (!requireNamespace('renv', quietly = TRUE)) install.packages('renv') + options(warn = 2) + tryCatch({ + renv::restore() + }, error = function(e) { + message("Error in renv::restore(): ", e$message) + install.packages(c("knitr", "rmarkdown", "lsa", "purrr", "ggplot2")) + }) + shell: Rscript {0} + + - name: Check R package + run: | + source flair_env/bin/activate + export TMPDIR="/tmp/R-pkg-tmp" + mkdir -p $TMPDIR + R CMD build . 
--no-build-vignettes + R CMD check *tar.gz --no-build-vignettes --no-manual --no-tests --no-examples || true + deactivate + shell: bash diff --git a/.gitignore b/.gitignore index 1c393a6e..2256e6d5 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ files/directories ^tmp.* vignettes/vignettes/inst classifier +/R/note.R diff --git a/DESCRIPTION b/DESCRIPTION index ac0b56ad..f5cd1cb6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: flaiR Title: An R Wrapper for Accessing FLAIR -Version: 0.0.6 +Version: 0.0.7 Authors@R: c( person("Yen-Chieh", "Liao", email = "davidyclaio@gmail.com", role = c("cre","aut", "ctb")), person("Stefan", "Müller", email = "stefan.mueller@ucd.ie", role = c("aut", "ctb")), @@ -36,6 +36,7 @@ Suggests: purrr, jsonlite, ggplot2, + plotly, testthat (>= 3.0.0) RoxygenNote: 7.3.2 VignetteBuilder: knitr, rmarkdown diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..29ef7410 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,30 @@ +FROM r-base:latest +LABEL maintainer="Yen-Chieh Liao " +# 安裝系統依賴 +RUN apt-get update && apt-get install -y \ + python3 \ + python3-pip \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev + +# 安裝 Python Flair NLP +RUN pip3 install flair + +# 安裝 R 依賴項 +RUN R -e "install.packages('remotes')" +RUN R -e "install.packages(c('data.table', 'reticulate', 'curl', 'attempt', 'htmltools', 'stringr'))" +RUN R -e "install.packages(c('knitr', 'rmarkdown', 'lsa', 'purrr', 'jsonlite', 'ggplot2', 'plotly', 'testthat'))" + +# 複製 R 套件到容器中 +COPY . 
/usr/src/my_pkg +WORKDIR /usr/src/my_pkg + +# 安裝 R 套件 +RUN R -e "remotes::install_local(force = TRUE)" + +# 清理不必要的文件 +RUN rm -rf /usr/src/my_pkg + +# 設定預設命令 +CMD ["R"] diff --git a/NAMESPACE b/NAMESPACE index 733305bc..47a73ecd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,18 +22,16 @@ export(flair_splitter) export(flair_splitter.SegtokSentenceSplitter) export(flair_trainers) export(get_entities) -export(get_entities_batch) export(get_pos) -export(get_pos_batch) -export(get_sentiments) -export(get_sentiments_batch) +export(get_tagger_tags) export(highlight_text) export(import_flair) export(install_python_package) export(load_tagger_ner) export(load_tagger_pos) -export(load_tagger_sentiments) export(map_entities) +export(predict_label) +export(process_embeddings) export(show_flair_cache) export(uninstall_python_package) import(reticulate) @@ -43,9 +41,7 @@ importFrom(data.table,data.table) importFrom(data.table,rbindlist) importFrom(htmltools,HTML) importFrom(reticulate,import) -importFrom(reticulate,py_available) importFrom(reticulate,py_get_attr) importFrom(reticulate,py_install) importFrom(reticulate,py_module_available) -importFrom(stats,setNames) importFrom(stringr,str_replace_all) diff --git a/NEWS.md b/NEWS.md index b130e799..66f0ecc4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,24 @@ +# flaiR 0.0.7 (2024-12-26) + +
+ +### _Enhancements_ + +- Added new tutorial section for embedding extraction from Flair NLP, focusing on embedding-based regression analysis + +### _Function_ + +- Unified POS tagging functions: Combined get_pos() and get_pos_batch() +- Unified entity recognition: Merged get_entities() and get_entities_batch() +- Removed sentiment analysis functions: Deprecated get_sentiments(), get_sentiments_batch(), and load_tagger_sentiments(). +
+ + + + + + # flaiR 0.0.6 (2023-10-29)
diff --git a/R/flaiR_loader.R b/R/flaiR_loader.R deleted file mode 100644 index 6f427ffd..00000000 --- a/R/flaiR_loader.R +++ /dev/null @@ -1,164 +0,0 @@ -#' @title Load the Named Entity Recognition (NER) Tagger -#' -#' @description A helper function to load the appropriate tagger based on the provided language. -#' This function supports a variety of languages/models. -#' -#' @param language A character string indicating the desired language for the NER tagger. -#' If `NULL`, the function will default to the 'pos-fast' model. -#' Supported languages and their models include: -#' \itemize{ -#' \item `"en"` - English NER tagging (`ner`) -#' \item `"de"` - German NER tagging (`de-ner`) -#' \item `"fr"` - French NER tagging (`fr-ner`) -#' \item `"nl"` - Dutch NER tagging (`nl-ner`) -#' \item `"da"` - Danish NER tagging (`da-ner`) -#' \item `"ar"` - Arabic NER tagging (`ar-ner`) -#' \item `"ner-fast"` - English NER fast model (`ner-fast`) -#' \item `"ner-large"` - English NER large mode (`ner-large`) -#' \item `"de-ner-legal"` - NER (legal text) (`de-ner-legal`) -#' \item `"nl"` - Dutch NER tagging (`nl-ner`) -#' \item `"da"` - Danish NER tagging (`da-ner`) -#' \item `"ar"` - Arabic NER tagging (`ar-ner`) -#'} -#' -#' @return An instance of the Flair SequenceTagger for the specified language. 
-#' -#' @import reticulate -#' @importFrom stats setNames -#' -#' @examples -#' # Load the English NER tagger -#' tagger_en <- load_tagger_ner("en") -#' -#' @export -load_tagger_ner <- function(language = NULL) { - supported_lan_models <- c("ner", "de-ner", - "fr-ner", "nl-ner", - "da-ner", "ar-ner", - "ner-fast", "ner-large", - "ner-pooled", "ner-ontonotes", - "ner-ontonotes-fast", "ner-ontonotes-large", - "de-ner-large", "de-ner-germeval", - "de-ner-legal", "es-ner", - "nl-ner", "nl-ner-large", - "nl-ner-rnn", "ner-ukrainian") - language_model_map <- setNames(supported_lan_models, c("en", "de", - "fr", "nl", - "da", "ar", - "ner-fast", "ner-large", - "ner-pooled", "ner-ontonotes", - "ner-ontonotes-fast", "ner-ontonotes-large", - "de-ner-large", "de-ner-germeval", - "de-ner-legal", "es-ner-large", - "nl-ner", "nl-ner-large", - "nl-ner-rnn", "ner-ukrainian") - ) - - if (is.null(language)) { - language <- "en" - message("Language is not specified. ", language, " in Flair is forceloaded. Please ensure that the internet connectivity is stable.") - } - - # Translate language to model name if necessary - if (language %in% names(language_model_map)) { - language <- language_model_map[[language]] - } - - # Ensure the model is supported - check_language_supported(language = language, supported_lan_models = supported_lan_models) - - # Load the model - SequenceTagger <- reticulate::import("flair.models")$SequenceTagger - SequenceTagger$load(language) -} - -#' @title Load Flair POS Tagger -#' -#' @description This function loads the POS (part-of-speech) tagger model for a specified language -#' using the Flair library. If no language is specified, it defaults to 'pos-fast'. -#' -#' @param language A character string indicating the desired language model. If `NULL`, -#' the function will default to the 'pos-fast' model. 
Supported language models include: -#' \itemize{ -#' \item "pos" - General POS tagging -#' \item "pos-fast" - Faster POS tagging -#' \item "upos" - Universal POS tagging -#' \item "upos-fast" - Faster Universal POS tagging -#' \item "pos-multi" - Multi-language POS tagging -#' \item "pos-multi-fast" - Faster Multi-language POS tagging -#' \item "ar-pos" - Arabic POS tagging -#' \item "de-pos" - German POS tagging -#' \item "de-pos-tweets" - German POS tagging for tweets -#' \item "da-pos" - Danish POS tagging -#' \item "ml-pos" - Malayalam POS tagging -#' \item "ml-upos" - Malayalam Universal POS tagging -#' \item "pt-pos-clinical" - Clinical Portuguese POS tagging -#' \item "pos-ukrainian" - Ukrainian POS tagging -#' } -#' @return A Flair POS tagger model corresponding to the specified (or default) language. -#' -#' @importFrom reticulate import -#' @export -#' @examples -#' \dontrun{ -#' tagger <- load_tagger_pos("pos-fast") -#' } -load_tagger_pos <- function(language = NULL) { - supported_lan_models <- c("pos", "pos-fast", "upos", "upos-fast", - "pos-multi", "pos-multi-fast", "ar-pos", "de-pos", - "de-pos-tweets", "da-pos", "ml-pos", - "ml-upos", "pt-pos-clinical", "pos-ukrainian") - - if (is.null(language)) { - language <- "pos-fast" - message("Language is not specified. ", language, "in Flair is forceloaded. Please ensure that the internet connectivity is stable. \n") - } - - # Ensure the model is supported - check_language_supported(language = language, supported_lan_models = supported_lan_models) - - # Load the model - flair <- reticulate::import("flair") - Classifier <- flair$nn$Classifier - tagger <- Classifier$load(language) -} - -#' @title Load a Sentiment or Language Tagger Model from Flair -#' -#' @description This function loads a pre-trained sentiment or language tagger -#' from the Flair library. -#' -#' @param language A character string specifying the language model to load. 
#' @title Load and Configure NER Tagger
#'
#' @description Loads a Named Entity Recognition model from Flair and, on
#' request, prints an organized view of its tag dictionary. Supports both
#' standard NER models (PER/ORG/LOC/MISC) and OntoNotes models
#' (PERSON/GPE/DATE/...).
#'
#' @param model_name Character string specifying the model to load.
#'   Can be "ner" (default), "flair/ner-english-large", or
#'   "flair/ner-english-ontonotes".
#' @param show_tags Logical, whether to display the tag dictionary.
#'   Default is TRUE.
#'
#' @return A Flair SequenceTagger model object
#' @export
load_tagger_ner <- function(model_name = "ner", show_tags = TRUE) {
  # Guard against an explicit NULL (the default already covers the missing case).
  if (is.null(model_name)) {
    model_name <- "ner"
    message("Model name is not specified. Using default 'ner' model.")
  }

  # Load the model; any Python-side failure is re-raised with guidance below.
  tryCatch({
    SequenceTagger <- flair_models()$SequenceTagger
    tagger <- SequenceTagger$load(model_name)

    # Extract and organize tags if requested
    if (show_tags) {
      tag_dict <- tagger$label_dictionary
      tag_list <- tag_dict$get_items()

      # Function to extract specific entity types
      get_entity_tags <- function(pattern, tags) {
        grep(pattern, tags, value = TRUE)
      }

      # Group tags by category (supporting both standard and OntoNotes).
      # FIX: the person pattern now matches both "PER" (standard models,
      # e.g. B-PER) and "PERSON" (OntoNotes) while excluding "PERCENT";
      # the previous "PERSON"-only pattern silently dropped the standard
      # model's person tags from the printed dictionary.
      categories <- list(
        special = get_entity_tags("^<.*>$|^O$", tag_list),
        person = get_entity_tags("PER$|PERSON", tag_list),
        organization = get_entity_tags("ORG", tag_list),
        location = get_entity_tags("LOC|GPE", tag_list),
        time = get_entity_tags("TIME|DATE", tag_list),
        numbers = get_entity_tags("CARDINAL|ORDINAL|PERCENT|MONEY", tag_list),
        groups = get_entity_tags("NORP", tag_list), # Nationalities, religious or political groups
        facilities = get_entity_tags("FAC", tag_list), # Buildings, airports, highways, bridges
        products = get_entity_tags("PRODUCT", tag_list),
        events = get_entity_tags("EVENT", tag_list),
        art = get_entity_tags("WORK_OF_ART", tag_list),
        languages = get_entity_tags("LANGUAGE", tag_list),
        laws = get_entity_tags("LAW", tag_list),
        misc = get_entity_tags("MISC", tag_list)
      )

      # Print organized output
      cat("\nNER Tagger Dictionary:\n")
      cat("========================================\n")
      cat(sprintf("Total tags: %d\n", length(tag_list)))
      cat(sprintf("Model: %s\n", model_name))
      cat("----------------------------------------\n")

      # Print categories with tags (empty categories are skipped)
      for (cat_name in names(categories)) {
        tags <- categories[[cat_name]]
        if (length(tags) > 0) {
          # Format category name
          formatted_name <- gsub("_", " ", tools::toTitleCase(cat_name))
          cat(sprintf("%-15s: %s\n",
                      formatted_name,
                      paste(tags, collapse = ", ")))
        }
      }

      cat("----------------------------------------\n")
      cat("Tag scheme: BIOES\n")
      cat("B-: Beginning of multi-token entity\n")
      cat("I-: Inside of multi-token entity\n")
      cat("O: Outside (not part of any entity)\n")
      cat("E-: End of multi-token entity\n")
      cat("S-: Single token entity\n")
      cat("========================================\n")
    }

    return(tagger)

  }, error = function(e) {
    stop(sprintf(
      "Error loading model: %s\nPlease check:\n - Model name is correct\n - Internet connection is stable\n - You have sufficient permissions\nError: %s",
      model_name, e$message
    ))
  })
}
#' @title Extract Model Tags
#'
#' @description Helper that pulls the tag dictionary out of a loaded Flair
#' SequenceTagger model and groups the tags into convenient categories
#' (person, organization, location, miscellaneous, plus special markers).
#'
#' @param tagger A loaded Flair SequenceTagger model
#'
#' @return A named list with elements:
#' \describe{
#'   \item{all}{Every tag in the model's dictionary.}
#'   \item{special}{Special markers such as the unknown token and O.}
#'   \item{person}{Person tags (e.g. B-PER, I-PER).}
#'   \item{organization}{Organization tags (e.g. B-ORG, E-ORG).}
#'   \item{location}{Location tags (e.g. B-LOC, S-LOC).}
#'   \item{misc}{Miscellaneous entity tags.}
#' }
#'
#' @details
#' Tags follow the BIOES scheme: B- (begin), I- (inside), O (outside),
#' E- (end), S- (single-token entity).
#'
#' @examples
#' \dontrun{
#' tagger <- load_tagger_ner("flair/ner-english-large")
#' tags <- get_tagger_tags(tagger)
#' print(tags$person)
#' print(tags$location)
#' }
#'
#' @seealso
#' \code{\link{load_tagger_ner}} for loading the NER model
#'
#' @export
get_tagger_tags <- function(tagger) {
  items <- tagger$label_dictionary$get_items()

  # One regular expression per category; applied uniformly below.
  patterns <- c(
    special      = "^<.*>$|^O$",
    person       = "PER",
    organization = "ORG",
    location     = "LOC",
    misc         = "MISC"
  )

  c(
    list(all = items),
    lapply(patterns, function(rx) grep(rx, items, value = TRUE))
  )
}
#' @title Load POS (Part-of-Speech) Tagger Model
#'
#' @description Loads a Part-of-Speech tagging model from Flair and displays
#' its tag dictionary in organized categories (nouns, verbs, adjectives, ...).
#'
#' @param model_name Character string specifying the model to load.
#'   Default is "pos-fast". Other values accepted by Flair include "pos",
#'   "upos", "upos-fast", "pos-multi", "pos-multi-fast" and various
#'   language-specific models.
#' @param show_tags Logical, whether to display the tag dictionary.
#'   Default is TRUE.
#'
#' @return A Flair tagger model object for POS tagging
#' @export
load_tagger_pos <- function(model_name = "pos-fast", show_tags = TRUE) {
  # Guard against an explicit NULL (the default already covers the missing case).
  if (is.null(model_name)) {
    model_name <- "pos-fast"
    message("Model name not specified. Using default 'pos-fast' model.")
  }

  # Load the model; failures are re-raised with troubleshooting hints below.
  tryCatch({
    flair <- reticulate::import("flair")
    Classifier <- flair$nn$Classifier

    message("Loading POS tagger model: ", model_name)
    tagger <- Classifier$load(model_name)

    # Display tag dictionary if requested
    if (show_tags) {
      tag_dict <- tagger$label_dictionary
      tag_list <- tag_dict$get_items()

      # Group tags by category using Penn Treebank tag prefixes.
      # NOTE(review): assumes an English PTB-style tag set; UPOS or
      # multilingual models will mostly fall into "other" -- confirm if
      # those models are expected here.
      categories <- list(
        special = grep("^<.*>$|^O$", tag_list, value = TRUE), # Special tags
        noun = grep("^NN|^PRP|^WP|^EX", tag_list, value = TRUE), # Nouns, pronouns
        verb = grep("^VB|^MD", tag_list, value = TRUE), # Verbs, modals
        adj = grep("^JJ|^POS", tag_list, value = TRUE), # Adjectives
        adv = grep("^RB|^WRB", tag_list, value = TRUE), # Adverbs
        det = grep("^DT|^WDT|^PDT", tag_list, value = TRUE), # Determiners
        prep = grep("^IN|^TO", tag_list, value = TRUE), # Prepositions
        conj = grep("^CC", tag_list, value = TRUE), # Conjunctions
        num = grep("^CD", tag_list, value = TRUE), # Numbers
        punct = grep("^[[:punct:]]|^-[LR]RB-|^HYPH|^NFP", tag_list, value = TRUE), # Punctuation
        other = grep("^FW|^SYM|^ADD|^XX|^UH|^LS|^\\$", tag_list, value = TRUE) # Others
      )

      # Print organized output
      cat("\nPOS Tagger Dictionary:\n")
      cat("========================================\n")
      cat(sprintf("Total tags: %d\n", length(tag_list)))
      cat("----------------------------------------\n")

      # Print each category, skipping empty ones
      if (length(categories$special) > 0)
        cat("Special: ", paste(categories$special, collapse = ", "), "\n")
      if (length(categories$noun) > 0)
        cat("Nouns: ", paste(categories$noun, collapse = ", "), "\n")
      if (length(categories$verb) > 0)
        cat("Verbs: ", paste(categories$verb, collapse = ", "), "\n")
      if (length(categories$adj) > 0)
        cat("Adjectives: ", paste(categories$adj, collapse = ", "), "\n")
      if (length(categories$adv) > 0)
        cat("Adverbs: ", paste(categories$adv, collapse = ", "), "\n")
      if (length(categories$det) > 0)
        cat("Determiners: ", paste(categories$det, collapse = ", "), "\n")
      if (length(categories$prep) > 0)
        cat("Prepositions: ", paste(categories$prep, collapse = ", "), "\n")
      if (length(categories$conj) > 0)
        cat("Conjunctions: ", paste(categories$conj, collapse = ", "), "\n")
      if (length(categories$num) > 0)
        cat("Numbers: ", paste(categories$num, collapse = ", "), "\n")
      if (length(categories$punct) > 0)
        cat("Punctuation: ", paste(categories$punct, collapse = ", "), "\n")
      if (length(categories$other) > 0)
        cat("Others: ", paste(categories$other, collapse = ", "), "\n")

      cat("----------------------------------------\n")
      cat("Common POS Tag Meanings:\n")
      cat("NN*: Nouns (NNP: Proper, NNS: Plural)\n")
      cat("VB*: Verbs (VBD: Past, VBG: Gerund)\n")
      cat("JJ*: Adjectives (JJR: Comparative)\n")
      cat("RB*: Adverbs\n")
      cat("PRP: Pronouns, DT: Determiners\n")
      cat("IN: Prepositions, CC: Conjunctions\n")
      cat("========================================\n")
    }

    return(tagger)

  }, error = function(e) {
    stop(sprintf(
      "Error loading POS model: %s\n Please check:\n - Model name is correct\n - Internet connection is stable\n - You have sufficient permissions\nError: %s",
      model_name, e$message
    ))
  })
}
#' @title Check if Tagger is Valid
#'
#' @description Internal function to verify that the provided tagger is a
#' usable Flair tagger object: non-NULL and exposing the methods this
#' package calls on it (`predict` and `to`).
#'
#' @param tagger A Flair tagger object to check
#' @return Logical `TRUE` if the tagger is valid; otherwise an error is raised.
#' @keywords internal
check_tagger <- function(tagger) {
  if (is.null(tagger)) {
    stop("Tagger cannot be NULL. Please provide a valid Flair tagger object.",
         "\nExample: tagger_ner <- load_tagger_ner('ner')")
  }

  # Check if tagger has required methods. vapply (rather than sapply)
  # guarantees a logical vector regardless of input, so !all() below is safe.
  required_methods <- c("predict", "to")
  has_methods <- vapply(required_methods, function(method) {
    !is.null(tagger[[method]]) && is.function(tagger[[method]])
  }, logical(1))

  if (!all(has_methods)) {
    missing_methods <- required_methods[!has_methods]
    stop("Invalid tagger object. Missing required methods: ",
         paste(missing_methods, collapse = ", "))
  }

  return(TRUE)
}
#' @title Extract Named Entities from Texts with Batch Processing
#'
#' @description Processes texts in batches and extracts named entities using
#' the Flair NLP library (via reticulate). Works with both standard NER and
#' OntoNotes models, with optional GPU placement and verbose progress output.
#'
#' @param texts A character vector containing the texts to process.
#' @param doc_ids A character or numeric vector of document IDs, one per text.
#' @param tagger A Flair tagger object for named entity recognition. Must be
#'   provided by the user, e.g. \code{load_tagger_ner('ner')} or
#'   \code{load_tagger_ner('flair/ner-english-ontonotes')}.
#' @param show.text_id Logical; if TRUE, the source text is attached to each
#'   row of the result. Default is FALSE.
#' @param gc.active Logical; if TRUE, runs the garbage collector after each
#'   batch. Default is FALSE.
#' @param batch_size Integer size of each batch (1 = per-text). Default is 5.
#' @param device Computation device string ("cpu", "cuda:0", ...). Default
#'   is "cpu".
#' @param verbose Logical; if TRUE, prints progress and a final summary.
#'   Default is FALSE.
#'
#' @return A data.table with columns doc_id, entity, tag, score (and text_id
#'   when show.text_id = TRUE). Rows with NA entity/tag mark texts where no
#'   entity was found or processing failed.
#'
#' @importFrom data.table data.table rbindlist
#' @importFrom reticulate import
#' @importFrom data.table :=
#' @export
get_entities <- function(texts, doc_ids = NULL, tagger, show.text_id = FALSE,
                         gc.active = FALSE, batch_size = 5, device = "cpu",
                         verbose = FALSE) {

  if (verbose) {
    message("Starting entity extraction process...")
    message("Checking tagger...")
  }

  # Check tagger (non-NULL and has predict/to methods)
  check_tagger(tagger)

  if (verbose) {
    message("Tagger validation successful")
    message("Number of texts to process: ", length(texts))
    message("Batch size: ", batch_size)
    message("Device: ", device)
  }

  # Check environment and parameters
  check_prerequisites()
  check_device(device)
  check_batch_size(batch_size)
  check_show.text_id(show.text_id)

  # Check and prepare texts and doc_ids (pads/validates doc_ids against texts)
  texts_and_ids <- check_texts_and_ids(texts, doc_ids)
  texts <- texts_and_ids$texts
  doc_ids <- texts_and_ids$doc_ids

  # Set device for processing.
  # NOTE(review): on failure, `device <- "cpu"` only rebinds the handler's
  # local copy; the outer `device` is unchanged. Harmless here because
  # `device` is not consulted again after this point, but worth confirming
  # the intent (use <<- or restructure if the outer value must change).
  if (device != "cpu") {
    tryCatch({
      tagger$to(device)
    }, error = function(e) {
      warning(sprintf("Error setting device %s: %s\nDefaulting to CPU.",
                      device, e$message))
      device <- "cpu"
    })
  }

  Sentence <- reticulate::import("flair")$data$Sentence

  # Helper function: render a textual progress bar like "[===>   ]  42%"
  create_progress_bar <- function(current, total, width = 50) {
    percent <- current / total
    filled <- round(width * percent)
    empty <- width - filled
    bar <- paste0(
      "[",
      strrep("=", filled),
      ">",
      strrep(" ", empty),
      "] ",
      sprintf("%3d%%", round(percent * 100))
    )
    return(bar)
  }

  # Process one batch of texts: predict per text, collect rows, bind.
  process_batch <- function(batch_texts, batch_doc_ids, batch_num, total_batches) {
    if (verbose) {
      progress_text <- sprintf("\rBatch %d/%d %s Processing %d texts...",
                               batch_num, total_batches,
                               create_progress_bar(batch_num, total_batches),
                               length(batch_texts))
      cat(progress_text)
    }

    results_list <- lapply(seq_along(batch_texts), function(i) {
      text <- batch_texts[[i]]
      doc_id <- batch_doc_ids[[i]]

      # NA input produces a placeholder row rather than an error
      if (is.na(text) || is.na(doc_id)) {
        if (verbose) message("Skipping NA text or doc_id")
        return(data.table(
          doc_id = NA,
          entity = NA,
          tag = NA,
          score = NA
        ))
      }

      # Error handling for sentence prediction: a failing text is reported
      # as a warning and contributes an NA row instead of aborting the run.
      tryCatch({
        sentence <- Sentence(text)
        tagger$predict(sentence)
        entities <- sentence$get_spans("ner")

        # No entities found: keep the doc_id with NA entity/tag/score
        if (length(entities) == 0) {
          return(data.table(
            doc_id = doc_id,
            entity = NA,
            tag = NA,
            score = NA
          ))
        }

        # One row per entity; vapply pins the element types
        dt <- data.table(
          doc_id = rep(doc_id, length(entities)),
          entity = vapply(entities, function(e) e$text, character(1)),
          tag = vapply(entities, function(e) e$tag, character(1)),
          score = vapply(entities, function(e) e$score, numeric(1))
        )

        # Optionally carry the full source text along for traceability
        if (isTRUE(show.text_id)) {
          dt[, text_id := text]
        }

        return(dt)
      }, error = function(e) {
        warning(sprintf("Error processing text %d: %s", i, e$message))
        return(data.table(
          doc_id = doc_id,
          entity = NA,
          tag = NA,
          score = NA
        ))
      })
    })

    if (verbose) {
      cat("\r", strrep(" ", 80), "\r")  # Clear current line
      cat(sprintf("Batch %d/%d completed\n", batch_num, total_batches))
    }

    return(rbindlist(results_list, fill = TRUE))
  }

  # Process all batches sequentially
  num_batches <- ceiling(length(texts) / batch_size)
  all_results <- lapply(1:num_batches, function(b) {
    start_idx <- (b - 1) * batch_size + 1
    end_idx <- min(b * batch_size, length(texts))

    batch_result <- process_batch(
      texts[start_idx:end_idx],
      doc_ids[start_idx:end_idx],
      b,
      num_batches
    )

    # Run garbage collection if requested (per batch)
    if (gc.active) {
      if (verbose) message("Running garbage collection...")
      check_and_gc(gc.active)
    }

    return(batch_result)
  })

  # Combine all results; fill = TRUE tolerates the optional text_id column
  final_results <- rbindlist(all_results, fill = TRUE)

  # Final summary
  if (verbose) {
    total_entities <- nrow(final_results[!is.na(entity)])
    message("\n", strrep("=", 60))
    message("Processing Summary:")
    message(strrep("-", 60))
    message(sprintf("Total texts processed: %d", length(texts)))
    message(sprintf("Total entities found: %d", total_entities))
    message(sprintf("Average entities per text: %.1f", total_entities/length(texts)))
    message(strrep("=", 60))
  }

  return(final_results)
}
-#' -#' @return A data.table containing the extracted entities, their corresponding -#' tags, and document IDs. -#' -#' @importFrom data.table data.table rbindlist -#' @importFrom reticulate import -#' @importFrom data.table := -#' @examples -#' \dontrun{ -#' library(reticulate) -#' library(fliaR) -#' -#' texts <- c("UCD is one of the best universities in Ireland.", -#' "UCD has a good campus but is very far from -#' my apartment in Dublin.", -#' "Essex is famous for social science research.", -#' "Essex is not in the Russell Group, but it is -#' famous for political science research.", -#' "TCD is the oldest university in Ireland.", -#' "TCD is similar to Oxford.") -#' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") -#' # Load NER ("ner") model -#' tagger_ner <- load_tagger_ner('ner') -#' results <- get_entities_batch(texts, doc_ids, tagger_ner) -#' print(results)} -#' @export -get_entities_batch <- function(texts, doc_ids, tagger = NULL, language = "en", - show.text_id = FALSE, gc.active = FALSE, - batch_size = 5, device = "cpu", verbose = TRUE) { - - # Check environment pre-requisites and parameters - check_prerequisites() - check_device(device) - check_batch_size(batch_size) - check_texts_and_ids(texts, doc_ids) - check_show.text_id(show.text_id) - - # Load tagger if null - if (is.null(tagger)) { - tagger <- load_tagger_ner(language) - } - - Sentence <- reticulate::import("flair")$data$Sentence - - # Entity Extraction function to process multiple texts in one call - process_texts_batch <- function(batch_texts, batch_doc_ids) { - text_id <- NULL - if (length(batch_texts) != length(batch_doc_ids)) { - stop("The lengths of batch_texts and batch_doc_ids do not match.") - } - - results_list <- lapply(seq_along(batch_texts), function(i) { - text <- batch_texts[[i]] - doc_id <- batch_doc_ids[[i]] - - if (is.na(text) || is.na(doc_id)) { - return(data.table(doc_id = NA, - entity = NA, - tag = NA, - text_id = ifelse(show.text_id, text, NA)) - ) - } - - 
sentence <- Sentence(text) - tagger$predict(sentence) - entities <- sentence$get_spans("ner") - - if (length(entities) == 0) { - return(data.table(doc_id = doc_id, - entity = NA, - tag = NA, - text_id = ifelse(show.text_id, text, NA))) - } - - # Unified data table creation process - dt <- data.table( - doc_id = rep(doc_id, length(entities)), - entity = vapply(entities, function(e) e$text, character(1)), - tag = vapply(entities, function(e) e$tag, character(1)) - ) - - if (isTRUE(show.text_id)) { - dt[, text_id := text] - } - - return(dt) - }) - - return(rbindlist(results_list, fill = TRUE)) # Return the combined result for this batch - } - - # Batch processing - num_batches <- ceiling(length(texts) / batch_size) - batched_results <- lapply(1:num_batches, function(b) { - # Print the progress if verbose is TRUE - if (isTRUE(verbose)) { - cat(sprintf("Processing batch %d out of %d...\n", b, num_batches)) - } - - start_idx <- (b - 1) * batch_size + 1 - end_idx <- min(b * batch_size, length(texts)) - return(process_texts_batch(texts[start_idx:end_idx], doc_ids[start_idx:end_idx])) - }) - - # Activate garbage collection - check_and_gc(gc.active) - - return(rbindlist(batched_results, fill = TRUE)) -} +#' #' @title Extract Named Entities from a Batch of Texts +#' #' +#' #' @description This function processes batches of texts and extracts named entities. +#' #' +#' #' @param texts A character vector of texts to process. +#' #' @param doc_ids A vector of document IDs corresponding to each text. +#' #' @param tagger A pre-loaded Flair NER tagger. Default is NULL, and the tagger is loaded based on the provided language. +#' #' @param language A character string specifying the language of the texts. Default is "en" (English). +#' #' @param show.text_id Logical, whether to include the text ID in the output. Default is FALSE. +#' #' @param gc.active Logical, whether to activate garbage collection after processing each batch. Default is FALSE. 
+#' #' @param batch_size An integer specifying the size of each batch. Default is 5. +#' #' @param device A character string specifying the computation device. +#' #' It can be either "cpu" or a string representation of a GPU device number. +#' #' For instance, "0" corresponds to the first GPU. If a GPU device number +#' #' is provided, it will attempt to use that GPU. The default is "cpu". +#' #' \itemize{ +#' #' \item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. If +#' #' there's only one GPU, specifying "cuda" or "cuda:0" will allocate +#' #' computations to this GPU.} +#' #' \item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation +#' #' of specific computations to this GPU.} +#' #' \item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems +#' #' with more GPUs.} +#' #' } +#' #' +#' #' @param verbose A logical value. If TRUE, the function prints batch processing +#' #' progress updates. Default is TRUE. +#' #' +#' #' @return A data.table containing the extracted entities, their corresponding +#' #' tags, and document IDs. 
+#' #' +#' #' @importFrom data.table data.table rbindlist +#' #' @importFrom reticulate import +#' #' @importFrom data.table := +#' #' @examples +#' #' \dontrun{ +#' #' library(reticulate) +#' #' library(fliaR) +#' #' +#' #' texts <- c("UCD is one of the best universities in Ireland.", +#' #' "UCD has a good campus but is very far from +#' #' my apartment in Dublin.", +#' #' "Essex is famous for social science research.", +#' #' "Essex is not in the Russell Group, but it is +#' #' famous for political science research.", +#' #' "TCD is the oldest university in Ireland.", +#' #' "TCD is similar to Oxford.") +#' #' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") +#' #' # Load NER ("ner") model +#' #' tagger_ner <- load_tagger_ner('ner') +#' #' results <- get_entities_batch(texts, doc_ids, tagger_ner) +#' #' print(results)} +#' #' @export + +# get_entities_batch <- function(texts, doc_ids, tagger = NULL, language = "en", +# show.text_id = FALSE, gc.active = FALSE, +# batch_size = 5, device = "cpu", verbose = TRUE) { +# +# # Check environment pre-requisites and parameters +# check_prerequisites() +# check_device(device) +# check_batch_size(batch_size) +# check_texts_and_ids(texts, doc_ids) +# check_show.text_id(show.text_id) +# +# # Load tagger if null +# if (is.null(tagger)) { +# tagger <- load_tagger_ner(language) +# } +# +# Sentence <- reticulate::import("flair")$data$Sentence +# +# # Entity Extraction function to process multiple texts in one call +# process_texts_batch <- function(batch_texts, batch_doc_ids) { +# text_id <- NULL +# if (length(batch_texts) != length(batch_doc_ids)) { +# stop("The lengths of batch_texts and batch_doc_ids do not match.") +# } +# +# results_list <- lapply(seq_along(batch_texts), function(i) { +# text <- batch_texts[[i]] +# doc_id <- batch_doc_ids[[i]] +# +# if (is.na(text) || is.na(doc_id)) { +# return(data.table(doc_id = NA, +# entity = NA, +# tag = NA, +# text_id = ifelse(show.text_id, text, NA)) +# ) +# } +# +# 
sentence <- Sentence(text) +# tagger$predict(sentence) +# entities <- sentence$get_spans("ner") +# +# if (length(entities) == 0) { +# return(data.table(doc_id = doc_id, +# entity = NA, +# tag = NA, +# text_id = ifelse(show.text_id, text, NA))) +# } +# +# # Unified data table creation process +# dt <- data.table( +# doc_id = rep(doc_id, length(entities)), +# entity = vapply(entities, function(e) e$text, character(1)), +# tag = vapply(entities, function(e) e$tag, character(1)) +# ) +# +# if (isTRUE(show.text_id)) { +# dt[, text_id := text] +# } +# +# return(dt) +# }) +# +# return(rbindlist(results_list, fill = TRUE)) # Return the combined result for this batch +# } +# +# # Batch processing +# num_batches <- ceiling(length(texts) / batch_size) +# batched_results <- lapply(1:num_batches, function(b) { +# # Print the progress if verbose is TRUE +# if (isTRUE(verbose)) { +# cat(sprintf("Processing batch %d out of %d...\n", b, num_batches)) +# } +# +# start_idx <- (b - 1) * batch_size + 1 +# end_idx <- min(b * batch_size, length(texts)) +# return(process_texts_batch(texts[start_idx:end_idx], doc_ids[start_idx:end_idx])) +# }) +# +# # Activate garbage collection +# check_and_gc(gc.active) +# +# return(rbindlist(batched_results, fill = TRUE)) +# } diff --git a/R/get_pos.R b/R/get_pos.R index fff7ff01..b763a033 100644 --- a/R/get_pos.R +++ b/R/get_pos.R @@ -85,7 +85,7 @@ get_pos <- function(texts, doc_ids = NULL, tagger = NULL, language = NULL, text_id = ifelse(show.text_id, text, NA), token = NA, tag = NA, - precision = NA)) + score = NA)) } sentence <- Sentence(texts[[i]]) tagger$predict(sentence) @@ -99,7 +99,7 @@ get_pos <- function(texts, doc_ids = NULL, tagger = NULL, language = NULL, text_id = ifelse(show.text_id, text, NA), token = NA, tag = NA, - precision = NA)) + score = NA)) } else { return(data.table( doc_id = rep(doc_ids[[i]], length(tag_list)), @@ -107,7 +107,7 @@ get_pos <- function(texts, doc_ids = NULL, tagger = NULL, language = NULL, text_id = 
ifelse(show.text_id, text, NA), token = vapply(tag_list, function(x) gsub('^Token\\[\\d+\\]: "(.*)" .*', '\\1', x), character(1)), tag = vapply(tag_list, function(x) gsub('^Token\\[\\d+\\]: ".*" \u2192 (.*) \\(.*\\)', '\\1', x), character(1)), - precision = as.numeric(vapply(tag_list, function(x) gsub(".*\\((.*)\\)", "\\1", x), character(1))) + score = as.numeric(vapply(tag_list, function(x) gsub(".*\\((.*)\\)", "\\1", x), character(1))) )) } } diff --git a/R/get_pos_batch.R b/R/get_pos_batch.R index e093468a..03df53aa 100644 --- a/R/get_pos_batch.R +++ b/R/get_pos_batch.R @@ -1,137 +1,137 @@ -#' @title Batch Process of Part-of-Speech Tagging +#' #' @title Batch Process of Part-of-Speech Tagging +#' #' +#' #' @description This function returns a data table of POS tags and other related +#' #' data for the given texts using batch processing. +#' #' +#' #' @param texts A character vector containing texts to be processed. +#' #' @param doc_ids A character vector containing document ids. +#' #' @param tagger A tagger object (default is NULL). +#' #' @param language The language of the texts (default is NULL). +#' #' @param show.text_id A logical value. If TRUE, includes the actual text from +#' #' which the entity was extracted in the resulting data table. Useful for +#' #' verification and traceability purposes but might increase the size of +#' #' the output. Default is FALSE. +#' #' @param gc.active A logical value. If TRUE, runs the garbage collector after +#' #' processing all texts. This can help in freeing up memory by releasing unused +#' #' memory space, especially when processing a large number of texts. +#' #' Default is FALSE. +#' #' @param batch_size An integer specifying the size of each batch. Default is 5. +#' #' @param device A character string specifying the computation device. +#' #' \itemize{ +#' #' \item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. 
If
+#' #' there's only one GPU, specifying "cuda" or "cuda:0" will allocate
+#' #' computations to this GPU.}
+#' #' \item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation
+#' #' of specific computations to this GPU.}
+#' #' \item{"cuda:2" ("mps:2")}{Refers to the third GPU in the system, and so on for systems
+#' #' with more GPUs.}
+#' #' }
+#' #'
+#' #' @param verbose A logical value. If TRUE, the function prints batch processing
+#' #' progress updates. Default is TRUE.
+#' #'
+#' #' @return A data.table containing the following columns:
+#' #' \describe{
+#' #' \item{`doc_id`}{The document identifier corresponding to each text.}
+#' #' \item{`token_id`}{The token number in the original text,
+#' #' indicating the position of the token.}
+#' #' \item{`text_id`}{The actual text input passed to the function (if show.text_id is TRUE).}
+#' #' \item{`token`}{The individual word or token from the text that was
+#' #' POS tagged.}
+#' #' \item{`tag`}{The part-of-speech tag assigned to the token by
+#' #' the Flair library.}
+#' #' \item{`precision`}{A confidence score (numeric) for the
+#' #' assigned POS tag.}
+#' #' }
+#' #'
+#' #' @import reticulate
+#' #' @export
+#' #'
+#' #' @examples
+#' #' \dontrun{
+#' #' library(reticulate)
+#' #' library(flaiR)
+#' #' tagger_pos_fast <- load_tagger_pos('pos-fast')
+#' #' texts <- c("UCD is one of the best universities in Ireland.",
+#' #' "Essex is not in the Russell Group, but it is famous for political science research.",
+#' #' "TCD is the oldest university in Ireland.")
+#' #' doc_ids <- c("doc1", "doc2", "doc3")
+#' #'
+#' #' # Using the batch_size parameter
+#' #' get_pos_batch(texts, doc_ids, tagger_pos_fast, batch_size = 2)
+#' #' }
 #'
-#' @description This function returns a data table of POS tags and other related
-#' data for the given texts using batch processing.
+#' get_pos_batch <- function(texts, doc_ids, tagger = NULL, language = NULL, +#' show.text_id = FALSE, gc.active = FALSE, +#' batch_size = 5, device = "cpu", verbose = TRUE) { #' -#' @param texts A character vector containing texts to be processed. -#' @param doc_ids A character vector containing document ids. -#' @param tagger A tagger object (default is NULL). -#' @param language The language of the texts (default is NULL). -#' @param show.text_id A logical value. If TRUE, includes the actual text from -#' which the entity was extracted in the resulting data table. Useful for -#' verification and traceability purposes but might increase the size of -#' the output. Default is FALSE. -#' @param gc.active A logical value. If TRUE, runs the garbage collector after -#' processing all texts. This can help in freeing up memory by releasing unused -#' memory space, especially when processing a large number of texts. -#' Default is FALSE. -#' @param batch_size An integer specifying the size of each batch. Default is 5. -#' @param device A character string specifying the computation device. -#' \itemize{ -#' \item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. If -#' there's only one GPU, specifying "cuda" or "cuda:0" will allocate -#' computations to this GPU.} -#' \item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation -#' of specific computations to this GPU.} -#' \item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems -#' with more GPUs.} -#' } +#' # Check environment pre-requisites and parameters +#' check_prerequisites() +#' check_texts_and_ids(texts, doc_ids) +#' check_show.text_id(show.text_id) +#' check_device(device) +#' check_batch_size(batch_size) #' -#' @param verbose A logical value. If TRUE, the function prints batch processing -#' progress updates. Default is TRUE. 
+#' # Import the `Sentence` tokenizer and `Classifier` from Python's Flair +#' flair <- reticulate::import("flair") +#' Sentence <- flair$data$Sentence #' -#' @return A data.table containing the following columns: -#' \describe{ -#' \item{`doc_id`}{The document identifier corresponding to each text.} -#' \item{`token_id`}{The token number in the original text, -#' indicating the position of the token.} -#' \item{`text_id`}{The actual text input passed to the function (if show.text_id is TRUE).} -#' \item{`token`}{The individual word or token from the text that was -#' POS tagged.} -#' \item{`tag`}{The part-of-speech tag assigned to the token by -#' the Flair library.} -#' \item{`precision`}{A confidence score (numeric) for the -#' assigned POS tag.} -#' } +#' # Load tagger if null +#' if (is.null(tagger)) { +#' tagger <- load_tagger_pos(language) +#' } +#' +#' # Function to process a single sentence +#' process_single_sentence <- function(sentence, doc_id) { +#' text <- sentence$text +#' tag_list <- sentence$labels +#' +#' # Check if there are no pos tag in tag_list if tag_list empty returns NAs +#' if (length(tag_list) == 0) { +#' return(data.table(doc_id = doc_id, +#' token_id = NA, +#' text_id = ifelse(show.text_id, text, NA), +#' token = NA, +#' tag = NA, +#' precision = NA)) +#' } else { +#' return(data.table( +#' doc_id = rep(doc_id, length(tag_list)), +#' token_id = as.numeric(vapply(tag_list, function(x) gsub("^Token\\[([0-9]+)\\].*$", "\\1", x), character(1))), +#' text_id = ifelse(show.text_id, text, NA), +#' token = vapply(tag_list, function(x) gsub('^Token\\[\\d+\\]: "(.*)" .*', '\\1', x), character(1)), +#' tag = vapply(tag_list, function(x) gsub('^Token\\[\\d+\\]: ".*" \u2192 (.*) \\(.*\\)', '\\1', x), character(1)), +#' precision = as.numeric(vapply(tag_list, function(x) gsub(".*\\((.*)\\)", "\\1", x), character(1))) +#' )) +#' } +#' } +#' +#' process_batch <- function(start_idx) { +#' if (verbose) { +#' cat(paste("Processing batch starting at 
index:", start_idx, "\n")) +#' } +#' +#' batch_texts <- texts[start_idx:(start_idx+batch_size-1)] +#' batch_ids <- doc_ids[start_idx:(start_idx+batch_size-1)] +#' batch_sentences <- lapply(batch_texts, Sentence) +#' lapply(batch_sentences, tagger$predict) +#' +#' dt_list <- lapply(seq_along(batch_sentences), function(i) { +#' process_single_sentence(batch_sentences[[i]], batch_ids[[i]]) +#' }) +#' +#' return(rbindlist(dt_list, fill = TRUE)) # Bind the results within this batch together +#' } +#' +#' # Split the data into batches and process each batch +#' n <- length(texts) +#' idxs <- seq(1, n, by = batch_size) +#' results_list <- lapply(idxs, process_batch) #' -#' @import reticulate -#' @export +#' # Combine the results from all batches +#' results_dt <- rbindlist(results_list, fill = TRUE) #' -#' @examples -#' \dontrun{ -#' library(reticulate) -#' library(fliaR) -#' tagger_pos_fast <- load_tagger_pos('pos-fast') -#' texts <- c("UCD is one of the best universities in Ireland.", -#' "Essex is not in the Russell Group, but it is famous for political science research.", -#' "TCD is the oldest university in Ireland.") -#' doc_ids <- c("doc1", "doc2", "doc3") +#' check_and_gc(gc.active) #' -#' # Using the batch_size parameter -#' get_pos_batch(texts, doc_ids, tagger_pos_fast, batch_size = 2) +#' return(results_dt) #' } - -get_pos_batch <- function(texts, doc_ids, tagger = NULL, language = NULL, - show.text_id = FALSE, gc.active = FALSE, - batch_size = 5, device = "cpu", verbose = TRUE) { - - # Check environment pre-requisites and parameters - check_prerequisites() - check_texts_and_ids(texts, doc_ids) - check_show.text_id(show.text_id) - check_device(device) - check_batch_size(batch_size) - - # Import the `Sentence` tokenizer and `Classifier` from Python's Flair - flair <- reticulate::import("flair") - Sentence <- flair$data$Sentence - - # Load tagger if null - if (is.null(tagger)) { - tagger <- load_tagger_pos(language) - } - - # Function to process a single sentence 
- process_single_sentence <- function(sentence, doc_id) { - text <- sentence$text - tag_list <- sentence$labels - - # Check if there are no pos tag in tag_list if tag_list empty returns NAs - if (length(tag_list) == 0) { - return(data.table(doc_id = doc_id, - token_id = NA, - text_id = ifelse(show.text_id, text, NA), - token = NA, - tag = NA, - precision = NA)) - } else { - return(data.table( - doc_id = rep(doc_id, length(tag_list)), - token_id = as.numeric(vapply(tag_list, function(x) gsub("^Token\\[([0-9]+)\\].*$", "\\1", x), character(1))), - text_id = ifelse(show.text_id, text, NA), - token = vapply(tag_list, function(x) gsub('^Token\\[\\d+\\]: "(.*)" .*', '\\1', x), character(1)), - tag = vapply(tag_list, function(x) gsub('^Token\\[\\d+\\]: ".*" \u2192 (.*) \\(.*\\)', '\\1', x), character(1)), - precision = as.numeric(vapply(tag_list, function(x) gsub(".*\\((.*)\\)", "\\1", x), character(1))) - )) - } - } - - process_batch <- function(start_idx) { - if (verbose) { - cat(paste("Processing batch starting at index:", start_idx, "\n")) - } - - batch_texts <- texts[start_idx:(start_idx+batch_size-1)] - batch_ids <- doc_ids[start_idx:(start_idx+batch_size-1)] - batch_sentences <- lapply(batch_texts, Sentence) - lapply(batch_sentences, tagger$predict) - - dt_list <- lapply(seq_along(batch_sentences), function(i) { - process_single_sentence(batch_sentences[[i]], batch_ids[[i]]) - }) - - return(rbindlist(dt_list, fill = TRUE)) # Bind the results within this batch together - } - - # Split the data into batches and process each batch - n <- length(texts) - idxs <- seq(1, n, by = batch_size) - results_list <- lapply(idxs, process_batch) - - # Combine the results from all batches - results_dt <- rbindlist(results_list, fill = TRUE) - - check_and_gc(gc.active) - - return(results_dt) -} diff --git a/R/get_sentiments.R b/R/get_sentiments.R index 4c156839..85184a94 100644 --- a/R/get_sentiments.R +++ b/R/get_sentiments.R @@ -1,115 +1,115 @@ -#' @title Tagging Sentiment with 
Flair Standard Models +#' #' @title Tagging Sentiment with Flair Standard Models +#' #' +#' #' @description This function takes in texts and their associated document IDs +#' #' to predict sentiments using the flair Python library. +#' #' +#' #' @param texts A list or vector of texts for which sentiment prediction is +#' #' to be made. +#' #' @param doc_ids A list or vector of document IDs corresponding to the texts. +#' #' @param language A character string indicating the language of the texts. +#' #' Currently supports "sentiment" (English), "sentiment-fast" (English), and +#' #' "de-offensive-language" (German) +#' #' @param tagger An optional flair sentiment model. If NULL (default), +#' #' the function loads the default model based on the language. +#' #' @param ... Additional arguments passed to next. +#' #' @param show.text_id A logical value. If TRUE, includes the actual text from +#' #' which the sentiment was predicted. Default is FALSE. +#' #' @param gc.active A logical value. If TRUE, runs the garbage collector after +#' #' processing all texts. This can help in freeing up memory by releasing unused +#' #' memory space, especially when processing a large number of texts. +#' #' Default is FALSE. +#' #' +#' #' @return A `data.table` containing three columns: +#' #' \itemize{ +#' #' \item `doc_id`: The document ID from the input. +#' #' \item `sentiment`: Predicted sentiment for the text. +#' #' \item `score`: Score for the sentiment prediction. 
+#' #' } +#' #' +#' #' @examples +#' #' \dontrun{ +#' #' library(flaiR) +#' #' texts <- c("UCD is one of the best universities in Ireland.", +#' #' "UCD has a good campus but is very far from my apartment in Dublin.", +#' #' "Essex is famous for social science research.", +#' #' "Essex is not in the Russell Group, but it is famous for political science research.", +#' #' "TCD is the oldest university in Ireland.", +#' #' "TCD is similar to Oxford.") +#' #' +#' #' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") +#' #' +#' #' # Load re-trained sentiment ("sentiment") model +#' #' tagger_sent <- load_tagger_sentiments('sentiment') +#' #' +#' #' results <- get_sentiments(texts, doc_ids, tagger_sent) +#' #' print(results) +#' #' } +#' #' +#' #' @importFrom data.table data.table +#' #' @importFrom reticulate py_available py_module_available import +#' #' @importFrom data.table := +#' #' @export +#' get_sentiments <- function(texts, doc_ids, +#' tagger = NULL, +#' ..., +#' language = NULL, +#' show.text_id = FALSE, gc.active = FALSE ) { +#' # Check environment pre-requisites +#' check_prerequisites() +#' check_show.text_id(show.text_id) #' -#' @description This function takes in texts and their associated document IDs -#' to predict sentiments using the flair Python library. +#' # Check and prepare texts and doc_ids +#' texts_and_ids <- check_texts_and_ids(texts, doc_ids) +#' texts <- texts_and_ids$texts +#' doc_ids <- texts_and_ids$doc_ids #' -#' @param texts A list or vector of texts for which sentiment prediction is -#' to be made. -#' @param doc_ids A list or vector of document IDs corresponding to the texts. -#' @param language A character string indicating the language of the texts. -#' Currently supports "sentiment" (English), "sentiment-fast" (English), and -#' "de-offensive-language" (German) -#' @param tagger An optional flair sentiment model. If NULL (default), -#' the function loads the default model based on the language. -#' @param ... 
Additional arguments passed to next. -#' @param show.text_id A logical value. If TRUE, includes the actual text from -#' which the sentiment was predicted. Default is FALSE. -#' @param gc.active A logical value. If TRUE, runs the garbage collector after -#' processing all texts. This can help in freeing up memory by releasing unused -#' memory space, especially when processing a large number of texts. -#' Default is FALSE. +#' # Load the Sentence tokenizer from the Flair library in Python. +#' flair <- reticulate::import("flair") +#' Classifier <- flair$nn$Classifier +#' Sentence <- flair$data$Sentence #' -#' @return A `data.table` containing three columns: -#' \itemize{ -#' \item `doc_id`: The document ID from the input. -#' \item `sentiment`: Predicted sentiment for the text. -#' \item `score`: Score for the sentiment prediction. -#' } +#' # Load tagger if null +#' if (is.null(tagger)) { +#' tagger <- load_tagger_sentiments(language) +#' } +#' +#' # Function to process each text +#' process_text <- function(text, doc_id) { +#' text_id <- NULL +#' if (is.na(text)) { +#' return(data.table(doc_id = doc_id, text_id = NA, sentiment = NA, score = NA)) +#' } +#' +#' # Create a sentence using provided text +#' sentence <- Sentence(text) #' -#' @examples -#' \dontrun{ -#' library(flaiR) -#' texts <- c("UCD is one of the best universities in Ireland.", -#' "UCD has a good campus but is very far from my apartment in Dublin.", -#' "Essex is famous for social science research.", -#' "Essex is not in the Russell Group, but it is famous for political science research.", -#' "TCD is the oldest university in Ireland.", -#' "TCD is similar to Oxford.") +#' # Predict sentiment +#' tagger$predict(sentence) #' -#' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") +#' # Check if there's a predicted label +#' if (length(sentence$get_labels()) > 0) { +#' sentiment_label <- sentence$get_labels()[[1]]$value +#' sentiment_score <- sentence$get_labels()[[1]]$score +#' } else { 
+#' sentiment_label <- NA +#' sentiment_score <- NA +#' } #' -#' # Load re-trained sentiment ("sentiment") model -#' tagger_sent <- load_tagger_sentiments('sentiment') +#' dt <- data.table(doc_id = doc_id, +#' sentiment = sentiment_label, +#' score = sentiment_score) #' -#' results <- get_sentiments(texts, doc_ids, tagger_sent) -#' print(results) -#' } +#' if (isTRUE(show.text_id)) { +#' dt[, text_id := text] +#' } #' -#' @importFrom data.table data.table -#' @importFrom reticulate py_available py_module_available import -#' @importFrom data.table := -#' @export -get_sentiments <- function(texts, doc_ids, - tagger = NULL, - ..., - language = NULL, - show.text_id = FALSE, gc.active = FALSE ) { - # Check environment pre-requisites - check_prerequisites() - check_show.text_id(show.text_id) - - # Check and prepare texts and doc_ids - texts_and_ids <- check_texts_and_ids(texts, doc_ids) - texts <- texts_and_ids$texts - doc_ids <- texts_and_ids$doc_ids - - # Load the Sentence tokenizer from the Flair library in Python. 
- flair <- reticulate::import("flair") - Classifier <- flair$nn$Classifier - Sentence <- flair$data$Sentence - - # Load tagger if null - if (is.null(tagger)) { - tagger <- load_tagger_sentiments(language) - } - - # Function to process each text - process_text <- function(text, doc_id) { - text_id <- NULL - if (is.na(text)) { - return(data.table(doc_id = doc_id, text_id = NA, sentiment = NA, score = NA)) - } - - # Create a sentence using provided text - sentence <- Sentence(text) - - # Predict sentiment - tagger$predict(sentence) - - # Check if there's a predicted label - if (length(sentence$get_labels()) > 0) { - sentiment_label <- sentence$get_labels()[[1]]$value - sentiment_score <- sentence$get_labels()[[1]]$score - } else { - sentiment_label <- NA - sentiment_score <- NA - } - - dt <- data.table(doc_id = doc_id, - sentiment = sentiment_label, - score = sentiment_score) - - if (isTRUE(show.text_id)) { - dt[, text_id := text] - } - - return(dt) - } - - results_list <- lapply(seq_along(texts), function(i) process_text(texts[i], doc_ids[i])) - results_dt <- rbindlist(results_list, fill=TRUE) - - # Activate garbage collection - check_and_gc(gc.active) - return(results_dt) - } +#' return(dt) +#' } +#' +#' results_list <- lapply(seq_along(texts), function(i) process_text(texts[i], doc_ids[i])) +#' results_dt <- rbindlist(results_list, fill=TRUE) +#' +#' # Activate garbage collection +#' check_and_gc(gc.active) +#' return(results_dt) +#' } diff --git a/R/get_sentiments_batch.R b/R/get_sentiments_batch.R index 6d9dca08..cc2cd619 100644 --- a/R/get_sentiments_batch.R +++ b/R/get_sentiments_batch.R @@ -1,146 +1,146 @@ -#' @title Batch Process of Tagging Sentiment with Flair Models +#' #' @title Batch Process of Tagging Sentiment with Flair Models +#' #' +#' #' @description This function takes in texts and their associated document IDs +#' #' to predict sentiments using the flair Python library. 
+#' #' +#' #' @param texts A list or vector of texts for which sentiment prediction is +#' #' to be made. +#' #' @param doc_ids A list or vector of document IDs corresponding to the texts. +#' #' @param language A character string indicating the language of the texts. +#' #' Currently supports "sentiment" (English), "sentiment-fast" (English), and +#' #' "de-offensive-language" (German) +#' #' @param tagger An optional flair sentiment model. If NULL (default), +#' #' the function loads the default model based on the language. +#' #' @param ... Additional arguments passed to next. +#' #' @param show.text_id A logical value. If TRUE, includes the actual text from +#' #' which the sentiment was predicted. Default is FALSE. +#' #' @param gc.active A logical value. If TRUE, runs the garbage collector after +#' #' processing all texts. This can help in freeing up memory by releasing unused +#' #' memory space, especially when processing a large number of texts. +#' #' Default is FALSE. +#' #' @param batch_size An integer specifying the number of texts to be processed +#' #' at once. It can help optimize performance by leveraging parallel processing. +#' #' Default is 5. +#' #' @param device A character string specifying the computation device. +#' #' It can be either "cpu" or a string representation of a GPU device number. +#' #' For instance, "0" corresponds to the first GPU. If a GPU device number +#' #' is provided, it will attempt to use that GPU. The default is "cpu". +#' #' \itemize{ +#' #' \item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. 
If +#' #' there's only one GPU, specifying "cuda" or "cuda:0" will allocate +#' #' computations to this GPU.} +#' #' \item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation +#' #' of specific computations to this GPU.} +#' #' \item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems +#' #' with more GPUs.} +#' #' } +#' #' +#' #' @param verbose A logical value. If TRUE, the function prints batch processing +#' #' progress updates. Default is TRUE. +#' #' +#' #' @return A `data.table` containing three columns: +#' #' \itemize{ +#' #' \item `doc_id`: The document ID from the input. +#' #' \item `sentiment`: Predicted sentiment for the text. +#' #' \item `score`: Score for the sentiment prediction. +#' #' } +#' #' +#' #' @examples +#' #' \dontrun{ +#' #' library(flaiR) +#' #' +#' #' +#' #' texts <- c("UCD is one of the best universities in Ireland.", +#' #' "UCD has a good campus but is very far from my apartment in Dublin.", +#' #' "Essex is famous for social science research.", +#' #' "Essex is not in the Russell Group, but it is famous for political science research.", +#' #' "TCD is the oldest university in Ireland.", +#' #' "TCD is similar to Oxford.") +#' #' +#' #' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") +#' #' +#' #' # Load re-trained sentiment ("sentiment") model +#' #' tagger_sent <- load_tagger_sentiments('sentiment') +#' #' +#' #' results <- get_sentiments_batch(texts, doc_ids, tagger_sent, batch_size = 3) +#' #' print(results) +#' #' } +#' #' +#' #' @importFrom data.table data.table +#' #' @importFrom reticulate py_available py_module_available import +#' #' @importFrom data.table := +#' #' @export +#' get_sentiments_batch <- function(texts, doc_ids, +#' tagger = NULL, ..., language = NULL, +#' show.text_id = FALSE, gc.active = FALSE, +#' batch_size = 5, device = "cpu", verbose = FALSE) { +#' # Check environment pre-requisites and parameters +#' check_prerequisites() +#' 
check_device(device) +#' check_batch_size(batch_size) +#' check_texts_and_ids(texts, doc_ids) +#' check_show.text_id(show.text_id) #' -#' @description This function takes in texts and their associated document IDs -#' to predict sentiments using the flair Python library. +#' # Load the Sentence tokenizer from the Flair library in Python. +#' flair <- reticulate::import("flair") +#' Classifier <- flair$nn$Classifier +#' Sentence <- flair$data$Sentence #' -#' @param texts A list or vector of texts for which sentiment prediction is -#' to be made. -#' @param doc_ids A list or vector of document IDs corresponding to the texts. -#' @param language A character string indicating the language of the texts. -#' Currently supports "sentiment" (English), "sentiment-fast" (English), and -#' "de-offensive-language" (German) -#' @param tagger An optional flair sentiment model. If NULL (default), -#' the function loads the default model based on the language. -#' @param ... Additional arguments passed to next. -#' @param show.text_id A logical value. If TRUE, includes the actual text from -#' which the sentiment was predicted. Default is FALSE. -#' @param gc.active A logical value. If TRUE, runs the garbage collector after -#' processing all texts. This can help in freeing up memory by releasing unused -#' memory space, especially when processing a large number of texts. -#' Default is FALSE. -#' @param batch_size An integer specifying the number of texts to be processed -#' at once. It can help optimize performance by leveraging parallel processing. -#' Default is 5. -#' @param device A character string specifying the computation device. -#' It can be either "cpu" or a string representation of a GPU device number. -#' For instance, "0" corresponds to the first GPU. If a GPU device number -#' is provided, it will attempt to use that GPU. The default is "cpu". -#' \itemize{ -#' \item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. 
If -#' there's only one GPU, specifying "cuda" or "cuda:0" will allocate -#' computations to this GPU.} -#' \item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation -#' of specific computations to this GPU.} -#' \item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems -#' with more GPUs.} -#' } +#' # Load tagger if null +#' if (is.null(tagger)) { +#' tagger <- load_tagger_sentiments(language) +#' } +#' +#' # `process_batch` to batch process +#' process_batch <- function(texts_batch, doc_ids_batch) { +#' text_id <- NULL +#' sentences <- lapply(texts_batch, flair$data$Sentence) +#' +#' # Predict sentiments for the entire batch +#' tagger$predict(sentences) +#' +#' results <- lapply(seq_along(sentences), function(i) { +#' sentence <- sentences[[i]] +#' doc_id <- doc_ids_batch[i] +#' +#' if (length(sentence$get_labels()) > 0) { +#' sentiment_label <- sentence$get_labels()[[1]]$value +#' sentiment_score <- sentence$get_labels()[[1]]$score +#' } else { +#' sentiment_label <- NA +#' sentiment_score <- NA +#' } #' -#' @param verbose A logical value. If TRUE, the function prints batch processing -#' progress updates. Default is TRUE. +#' dt <- data.table(doc_id = doc_id, +#' sentiment = sentiment_label, +#' score = sentiment_score) +#' if (isTRUE(show.text_id)) { +#' dt[, text_id := texts_batch[i]] +#' } +#' dt +#' }) #' -#' @return A `data.table` containing three columns: -#' \itemize{ -#' \item `doc_id`: The document ID from the input. -#' \item `sentiment`: Predicted sentiment for the text. -#' \item `score`: Score for the sentiment prediction. 
+#' return(rbindlist(results, fill = TRUE)) #' } #' -#' @examples -#' \dontrun{ -#' library(flaiR) +#' # Split texts into batches and process +#' num_batches <- ceiling(length(texts) / batch_size) +#' results_list <- lapply(1:num_batches, function(i) { +#' start_idx <- (i-1)*batch_size + 1 +#' end_idx <- min(i*batch_size, length(texts)) #' +#' if (isTRUE(verbose)) { +#' cat(sprintf("Processing batch %d out of %d...\n", i, num_batches)) +#' } #' -#' texts <- c("UCD is one of the best universities in Ireland.", -#' "UCD has a good campus but is very far from my apartment in Dublin.", -#' "Essex is famous for social science research.", -#' "Essex is not in the Russell Group, but it is famous for political science research.", -#' "TCD is the oldest university in Ireland.", -#' "TCD is similar to Oxford.") +#' process_batch(texts[start_idx:end_idx], doc_ids[start_idx:end_idx]) +#' }) #' -#' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") +#' results_dt <- rbindlist(results_list, fill = TRUE) #' -#' # Load re-trained sentiment ("sentiment") model -#' tagger_sent <- load_tagger_sentiments('sentiment') +#' # Activate garbage collection +#' check_and_gc(gc.active) #' -#' results <- get_sentiments_batch(texts, doc_ids, tagger_sent, batch_size = 3) -#' print(results) +#' return(results_dt) #' } -#' -#' @importFrom data.table data.table -#' @importFrom reticulate py_available py_module_available import -#' @importFrom data.table := -#' @export -get_sentiments_batch <- function(texts, doc_ids, - tagger = NULL, ..., language = NULL, - show.text_id = FALSE, gc.active = FALSE, - batch_size = 5, device = "cpu", verbose = FALSE) { - # Check environment pre-requisites and parameters - check_prerequisites() - check_device(device) - check_batch_size(batch_size) - check_texts_and_ids(texts, doc_ids) - check_show.text_id(show.text_id) - - # Load the Sentence tokenizer from the Flair library in Python. 
- flair <- reticulate::import("flair") - Classifier <- flair$nn$Classifier - Sentence <- flair$data$Sentence - - # Load tagger if null - if (is.null(tagger)) { - tagger <- load_tagger_sentiments(language) - } - - # `process_batch` to batch process - process_batch <- function(texts_batch, doc_ids_batch) { - text_id <- NULL - sentences <- lapply(texts_batch, flair$data$Sentence) - - # Predict sentiments for the entire batch - tagger$predict(sentences) - - results <- lapply(seq_along(sentences), function(i) { - sentence <- sentences[[i]] - doc_id <- doc_ids_batch[i] - - if (length(sentence$get_labels()) > 0) { - sentiment_label <- sentence$get_labels()[[1]]$value - sentiment_score <- sentence$get_labels()[[1]]$score - } else { - sentiment_label <- NA - sentiment_score <- NA - } - - dt <- data.table(doc_id = doc_id, - sentiment = sentiment_label, - score = sentiment_score) - if (isTRUE(show.text_id)) { - dt[, text_id := texts_batch[i]] - } - dt - }) - - return(rbindlist(results, fill = TRUE)) - } - - # Split texts into batches and process - num_batches <- ceiling(length(texts) / batch_size) - results_list <- lapply(1:num_batches, function(i) { - start_idx <- (i-1)*batch_size + 1 - end_idx <- min(i*batch_size, length(texts)) - - if (isTRUE(verbose)) { - cat(sprintf("Processing batch %d out of %d...\n", i, num_batches)) - } - - process_batch(texts[start_idx:end_idx], doc_ids[start_idx:end_idx]) - }) - - results_dt <- rbindlist(results_list, fill = TRUE) - - # Activate garbage collection - check_and_gc(gc.active) - - return(results_dt) -} diff --git a/R/note.R b/R/note.R index af3bb6e7..9c18c0d1 100644 --- a/R/note.R +++ b/R/note.R @@ -150,3 +150,357 @@ # packageStartupMessage("Flair NLP can be successfully imported in R via {flaiR} ! \U1F44F") # } # } + + + +#' #' @title Load the Named Entity Recognition (NER) Tagger +#' #' +#' #' @description A helper function to load the appropriate tagger based on the provided language. 
+#' #' This function supports a variety of languages/models.
+#' #'
+#' #' @param language A character string indicating the desired language for the NER tagger.
+#' #' If `NULL`, the function will default to the English 'ner' model.
+#' #' Supported languages and their models include:
+#' #' \itemize{
+#' #' \item `"en"` - English NER tagging (`ner`)
+#' #' \item `"de"` - German NER tagging (`de-ner`)
+#' #' \item `"fr"` - French NER tagging (`fr-ner`)
+#' #' \item `"nl"` - Dutch NER tagging (`nl-ner`)
+#' #' \item `"da"` - Danish NER tagging (`da-ner`)
+#' #' \item `"ar"` - Arabic NER tagging (`ar-ner`)
+#' #' \item `"ner-fast"` - English NER fast model (`ner-fast`)
+#' #' \item `"ner-large"` - English NER large model (`ner-large`)
+#' #' \item `"de-ner-legal"` - NER (legal text) (`de-ner-legal`)
+#' #' \item `"nl"` - Dutch NER tagging (`nl-ner`)
+#' #' \item `"da"` - Danish NER tagging (`da-ner`)
+#' #' \item `"ar"` - Arabic NER tagging (`ar-ner`)
+#' #'}
+#' #'
+#' #' @return An instance of the Flair SequenceTagger for the specified language.
+#' #' +#' #' @import reticulate +#' #' @importFrom stats setNames +#' #' +#' #' @examples +#' #' # Load the English NER tagger +#' #' tagger_en <- load_tagger_ner("en") +#' #' +#' #' @export +#' load_tagger_ner <- function(language = NULL) { +#' supported_lan_models <- c("ner", "de-ner", +#' "fr-ner", "nl-ner", +#' "da-ner", "ar-ner", +#' "ner-fast", "ner-large", +#' "ner-pooled", "ner-ontonotes", +#' "ner-ontonotes-fast", "ner-ontonotes-large", +#' "de-ner-large", "de-ner-germeval", +#' "de-ner-legal", "es-ner", +#' "nl-ner", "nl-ner-large", +#' "nl-ner-rnn", "ner-ukrainian") +#' language_model_map <- setNames(supported_lan_models, c("en", "de", +#' "fr", "nl", +#' "da", "ar", +#' "ner-fast", "ner-large", +#' "ner-pooled", "ner-ontonotes", +#' "ner-ontonotes-fast", "ner-ontonotes-large", +#' "de-ner-large", "de-ner-germeval", +#' "de-ner-legal", "es-ner-large", +#' "nl-ner", "nl-ner-large", +#' "nl-ner-rnn", "ner-ukrainian") +#' ) +#' +#' if (is.null(language)) { +#' language <- "en" +#' message("Language is not specified. ", language, " in Flair is forceloaded. Please ensure that the internet connectivity is stable.") +#' } +#' +#' # Translate language to model name if necessary +#' if (language %in% names(language_model_map)) { +#' language <- language_model_map[[language]] +#' } +#' +#' # Ensure the model is supported +#' check_language_supported(language = language, supported_lan_models = supported_lan_models) +#' +#' # Load the model +#' SequenceTagger <- reticulate::import("flair.models")$SequenceTagger +#' SequenceTagger$load(language) +#' } +#' +#' +#' + + + +#' #' @title Tagging Named Entities with Flair Models +#' #' +#' #' @description This function takes texts and their corresponding document IDs +#' #' as inputs, uses the Flair NLP library to extract named entities, +#' #' and returns a dataframe of the identified entities along with their tags. +#' #' When no entities are detected in a text, the function returns a data table +#' #' with NA values. 
This might clutter the results. Depending on your use case, +#' #' you might decide to either keep this behavior or skip rows with no detected +#' #' entities. +#' #' +#' #' @param texts A character vector containing the texts to process. +#' #' @param doc_ids A character or numeric vector containing the document IDs +#' #' corresponding to each text. +#' #' @param tagger An optional tagger object. If NULL (default), the function will +#' #' load a Flair tagger based on the specified language. +#' #' @param language A character string indicating the language model to load. +#' #' Default is "en". +#' #' @param show.text_id A logical value. If TRUE, includes the actual text from +#' #' which the entity was extracted in the resulting data table. Useful for +#' #' verification and traceability purposes but might increase the size of +#' #' the output. Default is FALSE. +#' #' @param gc.active A logical value. If TRUE, runs the garbage collector after +#' #' processing all texts. This can help in freeing up memory by releasing unused +#' #' memory space, especially when processing a large number of texts. +#' #' Default is FALSE. +#' #' @return A data table with columns: +#' #' \describe{ +#' #' \item{doc_id}{The ID of the document from which the entity was extracted.} +#' #' \item{text_id}{If TRUE, the actual text from which the entity +#' #' was extracted.} +#' #' \item{entity}{The named entity that was extracted from the text.} +#' #' \item{tag}{The tag or category of the named entity. 
Common tags include: +#' #' PERSON (names of individuals), +#' #' ORG (organizations, institutions), +#' #' GPE (countries, cities, states), +#' #' LOCATION (mountain ranges, bodies of water), +#' #' DATE (dates or periods), +#' #' TIME (times of day), +#' #' MONEY (monetary values), +#' #' PERCENT (percentage values), +#' #' FACILITY (buildings, airports), +#' #' PRODUCT (objects, vehicles), +#' #' EVENT (named events like wars or sports events), +#' #' ART (titles of books)}} +#' #' @examples +#' #' \dontrun{ +#' #' library(reticulate) +#' #' library(fliaR) +#' #' +#' #' texts <- c("UCD is one of the best universities in Ireland.", +#' #' "UCD has a good campus but is very far from +#' #' my apartment in Dublin.", +#' #' "Essex is famous for social science research.", +#' #' "Essex is not in the Russell Group, but it is +#' #' famous for political science research.", +#' #' "TCD is the oldest university in Ireland.", +#' #' "TCD is similar to Oxford.") +#' #' doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") +#' #' # Load NER ("ner") model +#' #' tagger_ner <- load_tagger_ner('ner') +#' #' results <- get_entities(texts, doc_ids, tagger_ner) +#' #' print(results)} +#' #' +#' #' @importFrom data.table data.table rbindlist +#' #' @importFrom reticulate import +#' #' @importFrom data.table := +#' #' @export +# get_entities <- function(texts, doc_ids = NULL, tagger = NULL, language = NULL, +# show.text_id = FALSE, gc.active = FALSE) { +# +# # Check environment pre-requisites +# check_prerequisites() +# check_show.text_id(show.text_id) +# +# # Check and prepare texts and doc_ids +# texts_and_ids <- check_texts_and_ids(texts, doc_ids) +# texts <- texts_and_ids$texts +# doc_ids <- texts_and_ids$doc_ids +# +# # Load tagger if null +# if (is.null(tagger)) { +# tagger <- load_tagger_ner(language) +# } +# +# Sentence <- reticulate::import("flair")$data$Sentence +# +# # Process each text and extract entities +# process_text <- function(text, doc_id) { +# text_id <- 
NULL +# if (is.na(text) || is.na(doc_id)) { +# return(data.table(doc_id = NA, entity = NA, tag = NA)) +# } +# +# sentence <- Sentence(text) +# tagger$predict(sentence) +# entities <- sentence$get_spans("ner") +# +# if (length(entities) == 0) { +# return(data.table(doc_id = doc_id, entity = NA, tag = NA)) +# } +# +# # Unified data table creation process +# dt <- data.table( +# doc_id = rep(doc_id, length(entities)), +# entity = vapply(entities, function(e) e$text, character(1)), +# tag = vapply(entities, function(e) e$tag, character(1)) +# ) +# +# if (isTRUE(show.text_id)) { +# dt[, text_id := text] +# } +# +# return(dt) +# } +# # Activate garbage collection +# check_and_gc(gc.active) +# +# results_list <- lapply(seq_along(texts), +# function(i) {process_text(texts[[i]], doc_ids[[i]])}) +# rbindlist(results_list, fill = TRUE) +# } + + + +#' @title Load NER Tagger Model +#' +#' @description Loads a Named Entity Recognition (NER) model from Flair. +#' Verifies that the loaded model is a valid NER model. +#' +#' @param model_name Character string specifying the model name to load. +#' Default is "ner" (English NER model). +#' +#' @return A Flair SequenceTagger model object for NER. +#' @examples +#' \dontrun{ +#' # Load default English NER model +#' tagger <- load_tagger_ner() +#' +#' # Load specific model +#' tagger <- load_tagger_ner("ner-fast") +#' } +#' @export +# load_tagger_ner <- function(model_name = "ner") { +# if (is.null(model_name)) { +# model_name <- "ner" +# message("Model name is not specified. Using default 'ner' model.") +# } +# +# # Load the model +# tryCatch({ +# SequenceTagger <- flair_models()$SequenceTagger +# tagger <- SequenceTagger$load("flair/ner-english-large") +# +# # Verify the model is for NER +# # Check if the model has NER-related tags +# tags <- tagger$labels +# if (!any(grepl("PER|ORG|LOC|GPE|MISC", tags))) { +# warning("The loaded model may not be a proper NER model. 
", +# "Please ensure it's designed for Named Entity Recognition.") +# } +# +# return(tagger) +# }, error = function(e) { +# stop("Error loading model: ", model_name, "\n", e$message) +# }) +# } +#' +#' #' @title Load and Configure NER Tagger +#' #' +#' #' @description Loads a Named Entity Recognition model from Flair and displays +#' #' its tag dictionary. +#' #' +#' #' @param model_name Character string specifying the model to load. +#' #' Default is "ner". +#' #' @param show_tags Logical, whether to display the tag dictionary. +#' #' Default is TRUE. +#' #' +#' #' @return A Flair SequenceTagger model object +#' #' @export +#' load_tagger_ner <- function(model_name = "ner", show_tags = TRUE) { +#' if (is.null(model_name)) { +#' model_name <- "ner" +#' message("Model name is not specified. Using default 'ner' model.") +#' } +#' +#' # Load the model +#' tryCatch({ +#' SequenceTagger <- flair_models()$SequenceTagger +#' tagger <- SequenceTagger$load("flair/ner-english-large") +#' +#' # Extract and organize tags if requested +#' if (show_tags) { +#' tag_dict <- tagger$label_dictionary +#' tag_list <- tag_dict$get_items() +#' +#' cat("\nNER Tagger Dictionary:\n") +#' cat("========================================\n") +#' cat(sprintf("Total tags: %d\n", length(tag_list))) +#' cat("----------------------------------------\n") +#' # Group and print tags by type +#' special_tags <- grep("^<.*>$|^O$", tag_list, value = TRUE) +#' person_tags <- grep("PER", tag_list, value = TRUE) +#' org_tags <- grep("ORG", tag_list, value = TRUE) +#' loc_tags <- grep("LOC", tag_list, value = TRUE) +#' misc_tags <- grep("MISC", tag_list, value = TRUE) +#' +#' if (length(special_tags) > 0) cat("Special:", paste(special_tags, collapse = ", "), "\n") +#' if (length(person_tags) > 0) cat("Person:", paste(person_tags, collapse = ", "), "\n") +#' if (length(org_tags) > 0) cat("Organization:", paste(org_tags, collapse = ", "), "\n") +#' if (length(loc_tags) > 0) cat("Location:", paste(loc_tags, 
collapse = ", "), "\n") +#' if (length(misc_tags) > 0) cat("Miscellaneous:", paste(misc_tags, collapse = ", "), "\n") +#' +#' cat("----------------------------------------\n") +#' cat("Tag scheme: BIOES\n") +#' cat("B-: Beginning of entity\n") +#' cat("I-: Inside of entity\n") +#' cat("O: Outside (not an entity)\n") +#' cat("E-: End of entity\n") +#' cat("S-: Single token entity\n") +#' cat("========================================\n") +#' } +#' +#' return(tagger) +#' }, error = function(e) { +#' stop("Error loading model: ", model_name, "\n", e$message) +#' }) +#' } +#' +#' + +#' +#' +#' #' @title Load a Sentiment or Language Tagger Model from Flair +#' #' +#' #' @description This function loads a pre-trained sentiment or language tagger +#' #' from the Flair library. +#' #' +#' #' @param language A character string specifying the language model to load. +#' #' Supported models include: +#' #' \itemize{ +#' #' \item "sentiment" - Sentiment analysis model +#' #' \item "sentiment-fast" - Faster sentiment analysis model +#' #' \item "de-offensive-language" - German offensive language detection model +#' #'} If not provided, the function will default to the "sentiment" model. +#' #' +#' #' @return An object of the loaded Flair model. +#' #' +#' #' @import reticulate +#' #' @examples +#' #' \dontrun{ +#' #' tagger <- load_tagger_sentiments("sentiment") +#' #' } +#' #' +#' #' @export +#' load_tagger_sentiments <- function(language = NULL) { +#' supported_lan_models <- c("sentiment", "sentiment-fast", "de-offensive-language") +#' +#' if (is.null(language)) { +#' language <- "sentiment" +#' message("Language is not specified. ", language, " in Flair is forceloaded. 
Please ensure that the internet connectivity is stable.") +#' } +#' +#' # Ensure the model is supported +#' check_language_supported(language = language, supported_lan_models = supported_lan_models) +#' +#' # Load the model +#' flair <- reticulate::import("flair") +#' Classifier <- flair$nn$Classifier +#' tagger <- Classifier$load(language) +#' return(tagger) +#' } + diff --git a/R/predict_label.R b/R/predict_label.R new file mode 100644 index 00000000..d28fe799 --- /dev/null +++ b/R/predict_label.R @@ -0,0 +1,96 @@ +#' Predict Text Label Using Flair Classifier +#' +#' @param text A character string containing the text to be labeled +#' @param classifier A Flair TextClassifier object for making predictions +#' @param sentence Optional Flair Sentence object. If NULL, one will be created from text +#' +#' @return A list containing: +#' \describe{ +#' \item{label}{Character string of predicted label} +#' \item{score}{Numeric confidence score from classifier} +#' \item{token_number}{Integer count of tokens in input text} +#' } +#' +#' @examples +#' \dontrun{ +#' # Example 1: Using text input +#' classifier <- flair_models()$TextClassifier$load('stance-classifier') +#' result1 <- predict_label( +#' text = "I strongly support this policy", +#' classifier = classifier +#' ) +#' +#' # Example 2: Using pre-created and tagged sentence +#' sent <- Sentence("I love Berlin and New York.") +#' tagger <- flair_models()$SequenceTagger$load('pos') +#' tagger$predict(sent) +#' print(sent) # Shows tokens with POS tags +#' +#' result2 <- predict_label( +#' text = NULL, +#' classifier = classifier, +#' sentence = sent +#' ) +#' } +#' +#' @import flaiR +#' @export +predict_label <- function(text, classifier, sentence = NULL) { + + # Check if classifier is valid + if (is.null(classifier) || !isTRUE(class(classifier)[1] == "flair.models.text_classification_model.TextClassifier")) { + stop("Invalid or missing classifier. 
Please provide a pre-trained Flair TextClassifier model.") + } + + # Check if Sentence exists and is correctly loaded + if (!("python.builtin.type" %in% class(Sentence))) { + stop("Sentence class not found or not properly loaded. Please ensure flaiR is properly loaded.") + } + + # Check if either text or sentence is provided + if (is.null(text) && is.null(sentence)) { + stop("Either text or sentence must be provided") + } + + # Create or validate sentence + if (is.null(sentence)) { + tryCatch({ + sentence <- Sentence(text) + }, error = function(e) { + stop("Failed to create Sentence object: ", e$message) + }) + } else { + # Enhanced sentence validation + if (!inherits(sentence, "flair.data.Sentence")) { + stop("Invalid sentence object. Must be a Flair Sentence instance.") + } + + if (!("tokens" %in% names(sentence)) || length(sentence$tokens) == 0) { + stop("Invalid sentence object: No tokens found.") + } + } + + # Use the classifier to predict + tryCatch({ + classifier$predict(sentence) + }, error = function(e) { + stop("Prediction failed: ", e$message) + }) + + # Verify prediction results + if (length(sentence$labels) == 0) { + stop("No prediction labels generated") + } + + # Get prediction details + predicted_label <- sentence$labels[[1]]$value + score <- sentence$labels[[1]]$score + token_number <- length(sentence$tokens) + + # Return results + return(list( + label = predicted_label, + score = score, + token_number = token_number + )) +} diff --git a/R/process_embeddings.R b/R/process_embeddings.R new file mode 100644 index 00000000..a64622d8 --- /dev/null +++ b/R/process_embeddings.R @@ -0,0 +1,144 @@ +#' Process Token Embeddings from Flair Sentence Object +#' +#' This function processes token embeddings from a Flair sentence object and converts them +#' into a matrix format with token names as row names. It handles the extraction of +#' embeddings from tokens, retrieval of token texts, and conversion to matrix format. 
+#' +#' @param sentence A Flair sentence object containing tokens with embeddings. +#' The sentence object should have a 'tokens' attribute, where each token +#' has both an 'embedding' (with numpy() method) and 'text' attribute. +#' @param verbose Logical indicating whether to print progress messages. Default is FALSE. +#' +#' @return A matrix where: +#' \itemize{ +#' \item Each row represents a token's embedding +#' \item Row names are the corresponding token texts +#' \item Columns represent the dimensions of the embedding vectors +#' } +#' +#' @details +#' The function will throw errors in the following cases: +#' \itemize{ +#' \item If sentence is NULL or has no tokens +#' \item If any token is missing an embedding +#' \item If any token is missing text +#' } +#' +#' @examples +#' \dontrun{ +#' # Create a Flair sentence +#' sentence <- Sentence("example text") +#' WordEmbeddings <- flair_embeddings()$WordEmbeddings +#' +#' # Initialize FastText embeddings trained on Common Crawl +#' fasttext_embeddings <- WordEmbeddings('en-crawl') +#' +#' # Apply embeddings +#' fasttext_embeddings$embed(sentence) +#' +#' # Process embeddings with timing and messages +#' embedding_matrix <- process_embeddings(sentence, verbose = TRUE) +#' } +#' +#' @import flaiR +#' @export +process_embeddings <- function(sentence, verbose = FALSE) { + # Start timing + start_time <- Sys.time() + + # Input validation + if (is.null(sentence) || is.null(sentence$tokens)) { + stop("Invalid input: sentence or tokens is NULL") + } + + # Check if embeddings are valid + if (!.has_valid_embeddings(sentence)) { + stop("Sentence needs to be embedded") + } + + # Extract and store embeddings for each token + if (verbose) message("Extracting token embeddings...") + sen_list <- list() + for (i in seq_along(sentence$tokens)) { + if (is.null(sentence$tokens[[i]]$embedding$numpy())) { + stop(sprintf("No embedding found for token at position %d", i)) + } + sen_list[[i]] <- 
as.vector(sentence$tokens[[i]]$embedding$numpy()) + } + + # Extract token texts for labeling + token_texts <- sapply(sentence$tokens, function(token) { + if (is.null(token$text)) { + stop("Token text is missing") + } + token$text + }) + + # Convert list of embeddings to matrix format + if (verbose) cat("Converting embeddings to matrix format...") + stacked_subset <- do.call(rbind, lapply(sen_list, function(x) { + matrix(x, nrow = 1) + })) + + # Add row names + rownames(stacked_subset) <- token_texts + + # Calculate processing time + end_time <- Sys.time() + processing_time <- round(as.numeric(difftime(end_time, start_time, units = "secs")), 3) + if (verbose) { + cat(sprintf("Processing completed in %s seconds\n", processing_time)) + cat(sprintf("Generated embedding matrix with %d tokens and %d dimensions\n", + nrow(stacked_subset), ncol(stacked_subset))) + } + + return(stacked_subset) +} + + +#' Internal function to check embedding validity +#' +#' This function verifies whether a Flair sentence object has been properly +#' embedded through the Flair framework by checking both the existence and +#' validity of embeddings. 
+#' +#' @param sentence A Flair sentence object to check +#' @return Logical indicating whether the sentence has valid embeddings +#' +#' @import flaiR +#' @noRd +.has_valid_embeddings <- function(sentence) { + # Input validation + if (is.null(sentence) || is.null(sentence$tokens)) { + return(FALSE) + } + + # Check if tokens exist and is not empty + if (length(sentence$tokens) == 0 || identical(sentence$tokens, numeric(0))) { + return(FALSE) + } + + # Safely get first token + first_token <- try(sentence$tokens[[1]], silent = TRUE) + if (inherits(first_token, "try-error") || + identical(first_token, numeric(0))) { + return(FALSE) + } + + # Check embedding structure exists + if (is.null(first_token$embedding)) { + return(FALSE) + } + + # Get numpy representation + embedding_array <- try(first_token$embedding$numpy(), silent = TRUE) + + # Check if embedding is valid (not numeric(0) and has values) + if (inherits(embedding_array, "try-error") || + identical(embedding_array, numeric(0)) || + length(embedding_array) == 0) { + return(FALSE) + } + + return(TRUE) +} diff --git a/README.Rmd b/README.Rmd index dbd3a613..2412ab50 100644 --- a/README.Rmd +++ b/README.Rmd @@ -25,7 +25,7 @@ knitr::opts_chunk$set( ```{r include=FALSE} library(reticulate) -library(flaiR) +# library(flaiR) reticulate::py_install("flair") system(paste(reticulate::py_config()$python, "-m pip install flair")) # options(repos = c(CRAN = "https://cloud.r-project.org/")) @@ -35,7 +35,7 @@ system(paste(reticulate::py_config()$python, "-m pip install flair"))
-__flaiR__ is an R package for accessing the [flairNLP/flair](flairNLP/flair) Python library, maintained by [Yen-ChiehLiao](https://davidycliao.github.io) ([University of Birmingham](https://www.birmingham.ac.uk/research/centres-institutes/centre-for-artificial-intelligence-in-government)) and [Stefan Müller](https://muellerstefan.net) from [Next Generation Energy Systems](https://www.nexsys-energy.ie) and [Text and Policy Research Group](https://text-and-policy.com) in UCD. flaiR provides convenient access to the main functionalities of flairNLP for training word embedding-based deep learning models and fine-tune state-of-the-art transformers hosted on Hugging Face. Our team trains and fine-tunes the models with Flair in [our projects]().
+__flaiR__ is an R package for accessing the [flairNLP/flair](https://github.com/flairNLP/flair) Python library, maintained by [Yen-Chieh Liao](https://davidycliao.github.io) ([University of Birmingham](https://www.birmingham.ac.uk/research/centres-institutes/centre-for-artificial-intelligence-in-government)) and [Stefan Müller](https://muellerstefan.net) from [Next Generation Energy Systems](https://www.nexsys-energy.ie) and [Text and Policy Research Group](https://text-and-policy.com) in UCD. flaiR provides convenient access to the main functionalities of flairNLP for training word embedding-based deep learning models and fine-tuning state-of-the-art transformers hosted on Hugging Face. Our team trains and fine-tunes the models with Flair in [our projects]().
@@ -50,10 +50,14 @@ install.packages("remotes") remotes::install_github("davidycliao/flaiR", force = TRUE) ``` -```{r} +```{r echo=TRUE, eval=FALSE} library(flaiR) ``` +``` r +#> flaiR: An R Wrapper for Accessing Flair NLP 0.13.1 +``` +
## Requirements @@ -75,8 +79,18 @@ __flaiR__ runs the Flair NLP backend in Python, thus requiring Python installati
+ +
+## Updates and News + +- [Tutorial for embeddings in flaiR](https://davidycliao.github.io/flaiR/articles/tutorial.html#embedding) + +- [Extending conText’s Embedding Regression](https://davidycliao.github.io/flaiR/articles/tutorial.html#extending-contexts-embedding-regression) + +
+ ## Contribution and Open Source @@ -93,6 +107,3 @@ The primary communication channel for R users can be found [here](https://github
- - - diff --git a/README.md b/README.md index 621b3100..80ff200c 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ **flaiR** is an R package for accessing the [flairNLP/flair](flairNLP/flair) Python library, maintained by -[Yen-ChiehLiao](https://davidycliao.github.io) ([University of +[Yen-Chieh Liao](https://davidycliao.github.io) ([University of Birmingham](https://www.birmingham.ac.uk/research/centres-institutes/centre-for-artificial-intelligence-in-government)) and [Stefan Müller](https://muellerstefan.net) from [Next Generation Energy Systems](https://www.nexsys-energy.ie) and [Text and Policy @@ -40,6 +40,10 @@ remotes::install_github("davidycliao/flaiR", force = TRUE) library(flaiR) ``` +``` r +#> flaiR: An R Wrapper for Accessing Flair NLP 0.13.1 +``` +
## Requirements @@ -71,6 +75,16 @@ these specific versions. +## Updates and News + +- [Tutorial for embeddings in + flaiR](https://davidycliao.github.io/flaiR/articles/tutorial.html#embedding) + +- [Extending conText’s Embedding + Regression](https://davidycliao.github.io/flaiR/articles/tutorial.html#extending-contexts-embedding-regression) + +
+ ## Contribution and Open Source
diff --git a/_pkgdown.yml b/_pkgdown.yml index 049e4a32..eec0482c 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -204,22 +204,22 @@ reference: # - flair_models()$ClusteringModel # - flair_models()$MultitaskModel -- title: Expanded Feats - desc:
Extended functionality for working with three major functions to extract features in data.table.
-- contents: - - load_tagger_ner - - load_tagger_pos - - load_tagger_sentiments - - get_entities - - get_entities_batch - - get_pos - - get_pos_batch - - get_sentiments - - get_sentiments_batch +# - title: Expanded Feats +# desc:
Extended functionality for working with three major functions to extract features in data.table.
+# - contents: +# - load_tagger_ner +# - load_tagger_pos +# - load_tagger_sentiments +# - get_entities +# - get_pos +# - get_sentiments +# - get_sentiments_batch -- title: Toolbox +- title: FlaiR Toolbox desc:
flairR utilities
- contents: + - predict_label + - process_embeddings - flair_device - highlight_text - map_entities @@ -228,6 +228,17 @@ reference: - show_flair_cache - uninstall_python_package - install_python_package + - get_tagger_tags + - load_tagger_ner + - load_tagger_pos + - get_entities + - get_pos + + + + + + - title: Training Dataset desc:
Training dataset for tutorial
@@ -266,12 +277,12 @@ navbar: - icon: fa-rocket text: Quick Start menu: + - text: "flaiR Installation" + href: articles/quickstart.html#flair-installation - text: "NLP Tasks" href: articles/quickstart.html#nlp-tasks - - text: "Class and Ojbect" - href: articles/quickstart.html#class-and-ojbect - - text: "More Details about Installation" - href: articles/quickstart.html#more-details-about-installation + - text: "Training and Finetuning" + href: articles/quickstart.html#training-and-fine-tuning - icon: fa-project-diagram @@ -285,27 +296,28 @@ navbar: href: articles/tutorial.html#sequence-taggings - text: "Performing NER Tasks" href: articles/tutorial.html#performing-ner-tasks - - text: "Flair Embeddings" - href: articles/tutorial.html#flair-embedding + - text: "Embeddings" + href: articles/tutorial.html#embedding - text: "Training a Binary Classifier" href: articles/tutorial.html#training-a-binary-classifier - text: "Training RNNs" href: articles/tutorial.html#training-rnns - text: "Finetune Transformers " href: articles/tutorial.html#finetune-transformers + - text: "Extending conText" + href: articles/tutorial.html#extending-contexts-embedding-regression - - icon: fa-newspaper-o - text: Expanded Feats - menu: - - text: "Part-of-speech Tagging" - href: articles/get_pos.html - - text: "Named Entity Recognition" - href: articles/get_entities.html - - text: "Tagging Sentiment" - href: articles/get_sentiments.html - - text: "The Coloring Entities" - href: articles/highlight_text.html - + # - icon: fa-newspaper-o + # text: Expanded Feats + # menu: + # - text: "Part-of-speech Tagging" + # href: articles/get_pos.html + # - text: "Named Entity Recognition" + # href: articles/get_entities.html + # - text: "Tagging Sentiment" + # href: articles/get_sentiments.html + # - text: "The Coloring Entities" + # href: articles/highlight_text.html - icon: fa-file-code-o text: Reference @@ -336,6 +348,8 @@ navbar: - icon: fa-newspaper-o text: News menu: + - text: "0.0.7" + href: 
news/index.html#flair-007-2024-12-26 - text: "0.0.6" href: news/index.html#flair-006-2023-10-29 - text: "0.0.5" diff --git a/data/.DS_Store b/data/.DS_Store new file mode 100644 index 00000000..5008ddfc Binary files /dev/null and b/data/.DS_Store differ diff --git a/man/check_tagger.Rd b/man/check_tagger.Rd new file mode 100644 index 00000000..1fcb6b81 --- /dev/null +++ b/man/check_tagger.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_entities.R +\name{check_tagger} +\alias{check_tagger} +\title{Check if Tagger is Valid} +\usage{ +check_tagger(tagger) +} +\arguments{ +\item{tagger}{A Flair tagger object to check} +} +\value{ +Logical indicating if the tagger is valid +} +\description{ +Internal function to verify if the provided tagger is valid +and has the required methods. +} +\keyword{internal} diff --git a/man/get_entities.Rd b/man/get_entities.Rd index 8edb583c..08fee104 100644 --- a/man/get_entities.Rd +++ b/man/get_entities.Rd @@ -2,15 +2,17 @@ % Please edit documentation in R/get_entities.R \name{get_entities} \alias{get_entities} -\title{Tagging Named Entities with Flair Models} +\title{Extract Named Entities from Texts with Batch Processing} \usage{ get_entities( texts, doc_ids = NULL, - tagger = NULL, - language = NULL, + tagger, show.text_id = FALSE, - gc.active = FALSE + gc.active = FALSE, + batch_size = 5, + device = "cpu", + verbose = FALSE ) } \arguments{ @@ -19,69 +21,125 @@ get_entities( \item{doc_ids}{A character or numeric vector containing the document IDs corresponding to each text.} -\item{tagger}{An optional tagger object. If NULL (default), the function will -load a Flair tagger based on the specified language.} - -\item{language}{A character string indicating the language model to load. -Default is "en".} +\item{tagger}{A Flair tagger object for named entity recognition. Must be provided +by the user. 
Can be created using load_tagger_ner() with different models: +\itemize{ +\item Standard NER: tagger_ner <- load_tagger_ner('ner') +\item OntoNotes: tagger_ner <- load_tagger_ner('flair/ner-english-ontonotes') +\item Large model: tagger_ner <- load_tagger_ner('flair/ner-english-large') +}} \item{show.text_id}{A logical value. If TRUE, includes the actual text from -which the entity was extracted in the resulting data table. Useful for -verification and traceability purposes but might increase the size of -the output. Default is FALSE.} +which the entity was extracted. Default is FALSE.} \item{gc.active}{A logical value. If TRUE, runs the garbage collector after -processing all texts. This can help in freeing up memory by releasing unused -memory space, especially when processing a large number of texts. -Default is FALSE.} +processing texts. Default is FALSE.} + +\item{batch_size}{An integer specifying the size of each batch. Set to 1 for +single-text processing. Default is 5.} + +\item{device}{A character string specifying the computation device ("cpu", +"cuda:0", "cuda:1", etc.). Default is "cpu". Note: MPS (Mac M1/M2) is currently +not fully supported and will default to CPU.} + +\item{verbose}{A logical value. If TRUE, prints processing progress. Default is FALSE.} } \value{ A data table with columns: \describe{ -\item{doc_id}{The ID of the document from which the entity was extracted.} -\item{text_id}{If TRUE, the actual text from which the entity -was extracted.} -\item{entity}{The named entity that was extracted from the text.} -\item{tag}{The tag or category of the named entity. 
Common tags include: -PERSON (names of individuals), -ORG (organizations, institutions), -GPE (countries, cities, states), -LOCATION (mountain ranges, bodies of water), -DATE (dates or periods), -TIME (times of day), -MONEY (monetary values), -PERCENT (percentage values), -FACILITY (buildings, airports), -PRODUCT (objects, vehicles), -EVENT (named events like wars or sports events), -ART (titles of books)}} +\item{doc_id}{Character or numeric. The ID of the document from which the +entity was extracted.} +\item{text_id}{Character. The complete text from which the entity was +extracted. Only included when show.text_id = TRUE.} +\item{entity}{Character. The actual named entity text that was extracted. +Will be NA if no entity was found.} +\item{tag}{Character. The category of the named entity. Available tags depend on +the model used: +\itemize{ +\item{Standard NER tags:} +\itemize{ +\item{PERSON: Names of people} +\item{ORG: Organizations} +\item{LOC: Locations} +\item{MISC: Miscellaneous entities} +} +\item{OntoNotes tags:} +\itemize{ +\item{PERSON: People, including fictional characters} +\item{ORG: Companies, agencies, institutions} +\item{GPE: Countries, cities, states} +\item{LOC: Non-GPE locations, mountains, water bodies} +\item{DATE: Absolute or relative dates} +\item{TIME: Times of day} +\item{MONEY: Monetary values} +\item{PERCENT: Percentage values} +\item{CARDINAL: Numerals} +\item{ORDINAL: Ordinal numbers} +\item{NORP: Nationalities, religious, or political groups} +\item{FAC: Buildings, airports, highways, bridges} +\item{WORK_OF_ART: Titles of books, songs, etc.} +\item{LAW: Named documents made into laws} +\item{LANGUAGE: Named languages} +} +}} +} } \description{ -This function takes texts and their corresponding document IDs -as inputs, uses the Flair NLP library to extract named entities, -and returns a dataframe of the identified entities along with their tags. 
-When no entities are detected in a text, the function returns a data table -with NA values. This might clutter the results. Depending on your use case, -you might decide to either keep this behavior or skip rows with no detected -entities. +This function processes texts in batches and extracts named entities +using the Flair NLP library. It supports both standard NER and OntoNotes models, +with options for batch processing and GPU acceleration. +} +\section{Tag Format}{ + +All tags use the BIOES (Begin, Inside, Outside, End, Single) scheme: +\itemize{ +\item{B-: Beginning of multi-token entity (e.g., B-PERSON in "John Smith")} +\item{I-: Inside of multi-token entity} +\item{O: Outside (not part of any entity)} +\item{E-: End of multi-token entity} +\item{S-: Single token entity (e.g., S-LOC in "Paris")} +} } + \examples{ \dontrun{ library(reticulate) library(fliaR) -texts <- c("UCD is one of the best universities in Ireland.", - "UCD has a good campus but is very far from - my apartment in Dublin.", - "Essex is famous for social science research.", - "Essex is not in the Russell Group, but it is - famous for political science research.", - "TCD is the oldest university in Ireland.", - "TCD is similar to Oxford.") -doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") -# Load NER ("ner") model -tagger_ner <- load_tagger_ner('ner') -results <- get_entities(texts, doc_ids, tagger_ner) -print(results)} +# Using standard NER model +tagger_std <- load_tagger_ner('ner') + +# Using OntoNotes model +tagger_onto <- load_tagger_ner('flair/ner-english-ontonotes') + +texts <- c( + "John Smith works at Google in New York.", + "The Eiffel Tower was built in 1889." 
+) +doc_ids <- c("doc1", "doc2") + +# Process with standard NER +results_std <- get_entities( + texts = texts, + doc_ids = doc_ids, + tagger = tagger_std, + batch_size = 2, + verbose = TRUE +) + +# Process with OntoNotes model +results_onto <- get_entities( + texts = texts, + doc_ids = doc_ids, + tagger = tagger_onto, + batch_size = 2, + verbose = TRUE +) + +# Filter specific entity types +persons <- results_onto[grepl("PERSON", tag)] +locations <- results_onto[grepl("LOC|GPE", tag)] +dates <- results_onto[grepl("DATE", tag)] +} } diff --git a/man/get_entities_batch.Rd b/man/get_entities_batch.Rd deleted file mode 100644 index e7bfe728..00000000 --- a/man/get_entities_batch.Rd +++ /dev/null @@ -1,76 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_entities_batch.R -\name{get_entities_batch} -\alias{get_entities_batch} -\title{Extract Named Entities from a Batch of Texts} -\usage{ -get_entities_batch( - texts, - doc_ids, - tagger = NULL, - language = "en", - show.text_id = FALSE, - gc.active = FALSE, - batch_size = 5, - device = "cpu", - verbose = TRUE -) -} -\arguments{ -\item{texts}{A character vector of texts to process.} - -\item{doc_ids}{A vector of document IDs corresponding to each text.} - -\item{tagger}{A pre-loaded Flair NER tagger. Default is NULL, and the tagger is loaded based on the provided language.} - -\item{language}{A character string specifying the language of the texts. Default is "en" (English).} - -\item{show.text_id}{Logical, whether to include the text ID in the output. Default is FALSE.} - -\item{gc.active}{Logical, whether to activate garbage collection after processing each batch. Default is FALSE.} - -\item{batch_size}{An integer specifying the size of each batch. Default is 5.} - -\item{device}{A character string specifying the computation device. -It can be either "cpu" or a string representation of a GPU device number. -For instance, "0" corresponds to the first GPU. 
If a GPU device number -is provided, it will attempt to use that GPU. The default is "cpu". -\itemize{ -\item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. If -there's only one GPU, specifying "cuda" or "cuda:0" will allocate -computations to this GPU.} -\item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation -of specific computations to this GPU.} -\item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems -with more GPUs.} -}} - -\item{verbose}{A logical value. If TRUE, the function prints batch processing -progress updates. Default is TRUE.} -} -\value{ -A data.table containing the extracted entities, their corresponding -tags, and document IDs. -} -\description{ -This function processes batches of texts and extracts named entities. -} -\examples{ -\dontrun{ -library(reticulate) -library(fliaR) - -texts <- c("UCD is one of the best universities in Ireland.", - "UCD has a good campus but is very far from - my apartment in Dublin.", - "Essex is famous for social science research.", - "Essex is not in the Russell Group, but it is - famous for political science research.", - "TCD is the oldest university in Ireland.", - "TCD is similar to Oxford.") -doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") -# Load NER ("ner") model -tagger_ner <- load_tagger_ner('ner') -results <- get_entities_batch(texts, doc_ids, tagger_ner) -print(results)} -} diff --git a/man/get_pos_batch.Rd b/man/get_pos_batch.Rd deleted file mode 100644 index 36384050..00000000 --- a/man/get_pos_batch.Rd +++ /dev/null @@ -1,86 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_pos_batch.R -\name{get_pos_batch} -\alias{get_pos_batch} -\title{Batch Process of Part-of-Speech Tagging} -\usage{ -get_pos_batch( - texts, - doc_ids, - tagger = NULL, - language = NULL, - show.text_id = FALSE, - gc.active = FALSE, - batch_size = 5, - device = "cpu", - 
verbose = TRUE -) -} -\arguments{ -\item{texts}{A character vector containing texts to be processed.} - -\item{doc_ids}{A character vector containing document ids.} - -\item{tagger}{A tagger object (default is NULL).} - -\item{language}{The language of the texts (default is NULL).} - -\item{show.text_id}{A logical value. If TRUE, includes the actual text from -which the entity was extracted in the resulting data table. Useful for -verification and traceability purposes but might increase the size of -the output. Default is FALSE.} - -\item{gc.active}{A logical value. If TRUE, runs the garbage collector after -processing all texts. This can help in freeing up memory by releasing unused -memory space, especially when processing a large number of texts. -Default is FALSE.} - -\item{batch_size}{An integer specifying the size of each batch. Default is 5.} - -\item{device}{A character string specifying the computation device. -\itemize{ -\item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. If -there's only one GPU, specifying "cuda" or "cuda:0" will allocate -computations to this GPU.} -\item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation -of specific computations to this GPU.} -\item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems -with more GPUs.} -}} - -\item{verbose}{A logical value. If TRUE, the function prints batch processing -progress updates. 
Default is TRUE.} -} -\value{ -A data.table containing the following columns: -\describe{ -\item{\code{doc_id}}{The document identifier corresponding to each text.} -\item{\code{token_id}}{The token number in the original text, -indicating the position of the token.} -\item{\code{text_id}}{The actual text input passed to the function (if show.text_id is TRUE).} -\item{\code{token}}{The individual word or token from the text that was -POS tagged.} -\item{\code{tag}}{The part-of-speech tag assigned to the token by -the Flair library.} -\item{\code{precision}}{A confidence score (numeric) for the -assigned POS tag.} -} -} -\description{ -This function returns a data table of POS tags and other related -data for the given texts using batch processing. -} -\examples{ -\dontrun{ -library(reticulate) -library(fliaR) -tagger_pos_fast <- load_tagger_pos('pos-fast') -texts <- c("UCD is one of the best universities in Ireland.", - "Essex is not in the Russell Group, but it is famous for political science research.", - "TCD is the oldest university in Ireland.") -doc_ids <- c("doc1", "doc2", "doc3") - -# Using the batch_size parameter -get_pos_batch(texts, doc_ids, tagger_pos_fast, batch_size = 2) -} -} diff --git a/man/get_sentiments.Rd b/man/get_sentiments.Rd deleted file mode 100644 index 99793dc1..00000000 --- a/man/get_sentiments.Rd +++ /dev/null @@ -1,71 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_sentiments.R -\name{get_sentiments} -\alias{get_sentiments} -\title{Tagging Sentiment with Flair Standard Models} -\usage{ -get_sentiments( - texts, - doc_ids, - tagger = NULL, - ..., - language = NULL, - show.text_id = FALSE, - gc.active = FALSE -) -} -\arguments{ -\item{texts}{A list or vector of texts for which sentiment prediction is -to be made.} - -\item{doc_ids}{A list or vector of document IDs corresponding to the texts.} - -\item{tagger}{An optional flair sentiment model. 
If NULL (default), -the function loads the default model based on the language.} - -\item{...}{Additional arguments passed to next.} - -\item{language}{A character string indicating the language of the texts. -Currently supports "sentiment" (English), "sentiment-fast" (English), and -"de-offensive-language" (German)} - -\item{show.text_id}{A logical value. If TRUE, includes the actual text from -which the sentiment was predicted. Default is FALSE.} - -\item{gc.active}{A logical value. If TRUE, runs the garbage collector after -processing all texts. This can help in freeing up memory by releasing unused -memory space, especially when processing a large number of texts. -Default is FALSE.} -} -\value{ -A \code{data.table} containing three columns: -\itemize{ -\item \code{doc_id}: The document ID from the input. -\item \code{sentiment}: Predicted sentiment for the text. -\item \code{score}: Score for the sentiment prediction. -} -} -\description{ -This function takes in texts and their associated document IDs -to predict sentiments using the flair Python library. 
-} -\examples{ -\dontrun{ -library(flaiR) -texts <- c("UCD is one of the best universities in Ireland.", - "UCD has a good campus but is very far from my apartment in Dublin.", - "Essex is famous for social science research.", - "Essex is not in the Russell Group, but it is famous for political science research.", - "TCD is the oldest university in Ireland.", - "TCD is similar to Oxford.") - -doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") - -# Load re-trained sentiment ("sentiment") model -tagger_sent <- load_tagger_sentiments('sentiment') - -results <- get_sentiments(texts, doc_ids, tagger_sent) -print(results) -} - -} diff --git a/man/get_sentiments_batch.Rd b/man/get_sentiments_batch.Rd deleted file mode 100644 index a19504d7..00000000 --- a/man/get_sentiments_batch.Rd +++ /dev/null @@ -1,97 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_sentiments_batch.R -\name{get_sentiments_batch} -\alias{get_sentiments_batch} -\title{Batch Process of Tagging Sentiment with Flair Models} -\usage{ -get_sentiments_batch( - texts, - doc_ids, - tagger = NULL, - ..., - language = NULL, - show.text_id = FALSE, - gc.active = FALSE, - batch_size = 5, - device = "cpu", - verbose = FALSE -) -} -\arguments{ -\item{texts}{A list or vector of texts for which sentiment prediction is -to be made.} - -\item{doc_ids}{A list or vector of document IDs corresponding to the texts.} - -\item{tagger}{An optional flair sentiment model. If NULL (default), -the function loads the default model based on the language.} - -\item{...}{Additional arguments passed to next.} - -\item{language}{A character string indicating the language of the texts. -Currently supports "sentiment" (English), "sentiment-fast" (English), and -"de-offensive-language" (German)} - -\item{show.text_id}{A logical value. If TRUE, includes the actual text from -which the sentiment was predicted. Default is FALSE.} - -\item{gc.active}{A logical value. 
If TRUE, runs the garbage collector after -processing all texts. This can help in freeing up memory by releasing unused -memory space, especially when processing a large number of texts. -Default is FALSE.} - -\item{batch_size}{An integer specifying the number of texts to be processed -at once. It can help optimize performance by leveraging parallel processing. -Default is 5.} - -\item{device}{A character string specifying the computation device. -It can be either "cpu" or a string representation of a GPU device number. -For instance, "0" corresponds to the first GPU. If a GPU device number -is provided, it will attempt to use that GPU. The default is "cpu". -\itemize{ -\item{"cuda" or "cuda:0" ("mps" or "mps:0" in Mac M1/M2 )}{Refers to the first GPU in the system. If -there's only one GPU, specifying "cuda" or "cuda:0" will allocate -computations to this GPU.} -\item{"cuda:1" ("mps:1")}{Refers to the second GPU in the system, allowing allocation -of specific computations to this GPU.} -\item{"cuda:2" ("mps:2)}{Refers to the third GPU in the system, and so on for systems -with more GPUs.} -}} - -\item{verbose}{A logical value. If TRUE, the function prints batch processing -progress updates. Default is TRUE.} -} -\value{ -A \code{data.table} containing three columns: -\itemize{ -\item \code{doc_id}: The document ID from the input. -\item \code{sentiment}: Predicted sentiment for the text. -\item \code{score}: Score for the sentiment prediction. -} -} -\description{ -This function takes in texts and their associated document IDs -to predict sentiments using the flair Python library. 
-} -\examples{ -\dontrun{ -library(flaiR) - - -texts <- c("UCD is one of the best universities in Ireland.", - "UCD has a good campus but is very far from my apartment in Dublin.", - "Essex is famous for social science research.", - "Essex is not in the Russell Group, but it is famous for political science research.", - "TCD is the oldest university in Ireland.", - "TCD is similar to Oxford.") - -doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6") - -# Load re-trained sentiment ("sentiment") model -tagger_sent <- load_tagger_sentiments('sentiment') - -results <- get_sentiments_batch(texts, doc_ids, tagger_sent, batch_size = 3) -print(results) -} - -} diff --git a/man/get_tagger_tags.Rd b/man/get_tagger_tags.Rd new file mode 100644 index 00000000..b4189d38 --- /dev/null +++ b/man/get_tagger_tags.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flair_loaders.R +\name{get_tagger_tags} +\alias{get_tagger_tags} +\title{Extract Model Tags} +\usage{ +get_tagger_tags(tagger) +} +\arguments{ +\item{tagger}{A loaded Flair SequenceTagger model} +} +\value{ +A list of tags grouped by category: +\describe{ +\item{all}{Complete list of all available tags} +\item{special}{Special tags like \if{html}{\out{}}, O, \if{html}{\out{}}, \if{html}{\out{}}} +\item{person}{Person-related tags (e.g., B-PER, I-PER)} +\item{organization}{Organization tags (e.g., B-ORG, E-ORG)} +\item{location}{Location tags (e.g., B-LOC, S-LOC)} +\item{misc}{Miscellaneous entity tags} +} +} +\description{ +Helper function to extract and categorize tags from a loaded Flair +SequenceTagger model. The tags are grouped into categories such as person, +organization, location, and miscellaneous. 
+} +\details{ +The tags follow the BIOES (Begin, Inside, Outside, End, Single) scheme: +\itemize{ +\item{B-: Beginning of multi-token entity (e.g., B-PER in "John Smith")} +\item{I-: Inside of multi-token entity (e.g., I-PER in "John Smith")} +\item{O: Outside of any entity} +\item{E-: End of multi-token entity} +\item{S-: Single token entity (e.g., S-LOC in "Paris")} +} +} +\examples{ +\dontrun{ +# Load a NER model +tagger <- load_tagger_ner("flair/ner-english-large") + +# Extract all tags +tags <- get_tagger_tags(tagger) + +# Access specific tag categories +print(tags$person) # All person-related tags +print(tags$location) # All location-related tags + +# Example usage with text annotation +# B-PER I-PER O S-ORG +# "John Smith works at Google" + +# B-LOC E-LOC O B-ORG E-ORG +# "New York is United Nations headquarters" + +# Use tags to filter entities +person_entities <- results[tag \%in\% tags$person] +org_entities <- results[tag \%in\% tags$organization] +} + +} +\seealso{ +\code{\link{load_tagger_ner}} for loading the NER model +} diff --git a/man/load_tagger_ner.Rd b/man/load_tagger_ner.Rd index ac1c447e..27ff5128 100644 --- a/man/load_tagger_ner.Rd +++ b/man/load_tagger_ner.Rd @@ -1,39 +1,22 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/flaiR_loader.R +% Please edit documentation in R/flair_loaders.R \name{load_tagger_ner} \alias{load_tagger_ner} -\title{Load the Named Entity Recognition (NER) Tagger} +\title{Load and Configure NER Tagger} \usage{ -load_tagger_ner(language = NULL) +load_tagger_ner(model_name = "ner", show_tags = TRUE) } \arguments{ -\item{language}{A character string indicating the desired language for the NER tagger. -If \code{NULL}, the function will default to the 'pos-fast' model. 
-Supported languages and their models include: -\itemize{ -\item \code{"en"} - English NER tagging (\code{ner}) -\item \code{"de"} - German NER tagging (\code{de-ner}) -\item \code{"fr"} - French NER tagging (\code{fr-ner}) -\item \code{"nl"} - Dutch NER tagging (\code{nl-ner}) -\item \code{"da"} - Danish NER tagging (\code{da-ner}) -\item \code{"ar"} - Arabic NER tagging (\code{ar-ner}) -\item \code{"ner-fast"} - English NER fast model (\code{ner-fast}) -\item \code{"ner-large"} - English NER large mode (\code{ner-large}) -\item \code{"de-ner-legal"} - NER (legal text) (\code{de-ner-legal}) -\item \code{"nl"} - Dutch NER tagging (\code{nl-ner}) -\item \code{"da"} - Danish NER tagging (\code{da-ner}) -\item \code{"ar"} - Arabic NER tagging (\code{ar-ner}) -}} +\item{model_name}{Character string specifying the model to load. +Can be "ner" (default), "flair/ner-english-large", or "flair/ner-english-ontonotes"} + +\item{show_tags}{Logical, whether to display the tag dictionary. +Default is TRUE.} } \value{ -An instance of the Flair SequenceTagger for the specified language. +A Flair SequenceTagger model object } \description{ -A helper function to load the appropriate tagger based on the provided language. -This function supports a variety of languages/models. -} -\examples{ -# Load the English NER tagger -tagger_en <- load_tagger_ner("en") - +Loads a Named Entity Recognition model from Flair and displays +its tag dictionary. Supports both standard NER and OntoNotes models. 
} diff --git a/man/load_tagger_pos.Rd b/man/load_tagger_pos.Rd index fe582c4d..bc2fcfb1 100644 --- a/man/load_tagger_pos.Rd +++ b/man/load_tagger_pos.Rd @@ -1,12 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/flaiR_loader.R +% Please edit documentation in R/flair_loaders.R \name{load_tagger_pos} \alias{load_tagger_pos} \title{Load Flair POS Tagger} \usage{ -load_tagger_pos(language = NULL) +load_tagger_pos(model_name = "pos-fast", show_tags = TRUE) } \arguments{ +\item{model_name}{Character string specifying the model to load. +Default is "pos-fast".} + +\item{show_tags}{Logical, whether to display the tag dictionary. +Default is TRUE.} + \item{language}{A character string indicating the desired language model. If \code{NULL}, the function will default to the 'pos-fast' model. Supported language models include: \itemize{ @@ -28,10 +34,15 @@ the function will default to the 'pos-fast' model. Supported language models inc } \value{ A Flair POS tagger model corresponding to the specified (or default) language. + +A Flair tagger model object for POS tagging } \description{ This function loads the POS (part-of-speech) tagger model for a specified language using the Flair library. If no language is specified, it defaults to 'pos-fast'. + +Loads a Part-of-Speech tagging model from Flair and displays +its tag dictionary in organized categories. } \examples{ \dontrun{ diff --git a/man/load_tagger_sentiments.Rd b/man/load_tagger_sentiments.Rd deleted file mode 100644 index 8b07c18a..00000000 --- a/man/load_tagger_sentiments.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/flaiR_loader.R -\name{load_tagger_sentiments} -\alias{load_tagger_sentiments} -\title{Load a Sentiment or Language Tagger Model from Flair} -\usage{ -load_tagger_sentiments(language = NULL) -} -\arguments{ -\item{language}{A character string specifying the language model to load. 
-Supported models include: -\itemize{ -\item "sentiment" - Sentiment analysis model -\item "sentiment-fast" - Faster sentiment analysis model -\item "de-offensive-language" - German offensive language detection model -} If not provided, the function will default to the "sentiment" model.} -} -\value{ -An object of the loaded Flair model. -} -\description{ -This function loads a pre-trained sentiment or language tagger -from the Flair library. -} -\examples{ -\dontrun{ - tagger <- load_tagger_sentiments("sentiment") -} - -} diff --git a/man/predict_label.Rd b/man/predict_label.Rd new file mode 100644 index 00000000..0acd9b57 --- /dev/null +++ b/man/predict_label.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict_label.R +\name{predict_label} +\alias{predict_label} +\title{Predict Text Label Using Flair Classifier} +\usage{ +predict_label(text, classifier, sentence = NULL) +} +\arguments{ +\item{text}{A character string containing the text to be labeled} + +\item{classifier}{A Flair TextClassifier object for making predictions} + +\item{sentence}{Optional Flair Sentence object. 
If NULL, one will be created from text} +} +\value{ +A list containing: +\describe{ +\item{label}{Character string of predicted label} +\item{score}{Numeric confidence score from classifier} +\item{token_number}{Integer count of tokens in input text} +} +} +\description{ +Predict Text Label Using Flair Classifier +} +\examples{ +\dontrun{ +# Example 1: Using text input +classifier <- flair_models()$TextClassifier$load('stance-classifier') +result1 <- predict_label( + text = "I strongly support this policy", + classifier = classifier +) + +# Example 2: Using pre-created and tagged sentence +sent <- Sentence("I love Berlin and New York.") +tagger <- flair_models()$SequenceTagger$load('pos') +tagger$predict(sent) +print(sent) # Shows tokens with POS tags + +result2 <- predict_label( + text = NULL, + classifier = classifier, + sentence = sent +) +} + +} diff --git a/man/process_embeddings.Rd b/man/process_embeddings.Rd new file mode 100644 index 00000000..353cb04a --- /dev/null +++ b/man/process_embeddings.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process_embeddings.R +\name{process_embeddings} +\alias{process_embeddings} +\title{Process Token Embeddings from Flair Sentence Object} +\usage{ +process_embeddings(sentence, verbose = FALSE) +} +\arguments{ +\item{sentence}{A Flair sentence object containing tokens with embeddings. +The sentence object should have a 'tokens' attribute, where each token +has both an 'embedding' (with numpy() method) and 'text' attribute.} + +\item{verbose}{Logical indicating whether to print progress messages. 
Default is FALSE.} +} +\value{ +A matrix where: +\itemize{ +\item Each row represents a token's embedding +\item Row names are the corresponding token texts +\item Columns represent the dimensions of the embedding vectors +} +} +\description{ +This function processes token embeddings from a Flair sentence object and converts them +into a matrix format with token names as row names. It handles the extraction of +embeddings from tokens, retrieval of token texts, and conversion to matrix format. +} +\details{ +The function will throw errors in the following cases: +\itemize{ +\item If sentence is NULL or has no tokens +\item If any token is missing an embedding +\item If any token is missing text +} +} +\examples{ +\dontrun{ +# Create a Flair sentence +sentence <- Sentence("example text") +WordEmbeddings <- flair_embeddings()$WordEmbeddings + +# Initialize FastText embeddings trained on Common Crawl +fasttext_embeddings <- WordEmbeddings('en-crawl') + +# Apply embeddings +fasttext_embeddings$embed(sentence) + +# Process embeddings with timing and messages +embedding_matrix <- process_embeddings(sentence, verbose = TRUE) +} + +} diff --git a/renv.lock b/renv.lock index 4fd5d90a..29f8c376 100644 --- a/renv.lock +++ b/renv.lock @@ -11,9 +11,9 @@ "Packages": { "Matrix": { "Package": "Matrix", - "Version": "1.6-5", + "Version": "1.7-1", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R", "grDevices", @@ -24,18 +24,18 @@ "stats", "utils" ], - "Hash": "8c7115cd3a0e048bda2a7cd110549f7a" + "Hash": "5122bb14d8736372411f955e1b16bc8a" }, "Rcpp": { "Package": "Rcpp", - "Version": "1.0.13", + "Version": "1.0.13-1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "methods", "utils" ], - "Hash": "f27411eb6d9c3dada5edd444b8416675" + "Hash": "6b868847b365672d6c1677b1608da9ed" }, "RcppTOML": { "Package": "RcppTOML", @@ -81,24 +81,24 @@ }, "curl": { "Package": "curl", - "Version": "5.2.2", + "Version": "6.0.1", "Source": 
"Repository", "Repository": "CRAN", "Requirements": [ "R" ], - "Hash": "8f27335f2bcff4d6035edcc82d7d46de" + "Hash": "e8ba62486230951fcd2b881c5be23f96" }, "data.table": { "Package": "data.table", - "Version": "1.16.0", + "Version": "1.16.4", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "methods" ], - "Hash": "fb24e05d4a91d8b1c7ff8e284bde834a" + "Hash": "38bbf05fc2503143db4c734a7e5cab66" }, "digest": { "Package": "digest", @@ -120,20 +120,20 @@ }, "glue": { "Package": "glue", - "Version": "1.7.0", + "Version": "1.8.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "methods" ], - "Hash": "e0b3a53876554bd45879e596cdb10a52" + "Hash": "5899f1eaa825580172bb56c08266f37c" }, "here": { "Package": "here", "Version": "1.0.1", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "rprojroot" ], @@ -157,13 +157,13 @@ }, "jsonlite": { "Package": "jsonlite", - "Version": "1.8.8", + "Version": "1.8.9", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "methods" ], - "Hash": "e1b9c55281c5adc4dd113652d9e26768" + "Hash": "4e993b65c2c3ffbffce7bb3e2c6f832b" }, "lattice": { "Package": "lattice", @@ -207,7 +207,7 @@ "Package": "png", "Version": "0.1-8", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R" ], @@ -225,17 +225,17 @@ }, "renv": { "Package": "renv", - "Version": "1.0.7", + "Version": "1.0.11", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "utils" ], - "Hash": "397b7b2a265bc5a7a06852524dabae20" + "Hash": "47623f66b4e80b3b0587bc5d7b309888" }, "reticulate": { "Package": "reticulate", - "Version": "1.39.0", + "Version": "1.40.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -253,7 +253,7 @@ "utils", "withr" ], - "Hash": "e1a5d04397edc1580c5e0ed1dbdccf76" + "Hash": "04740f615607c4e6099356ff6d6694ee" }, "rlang": { "Package": "rlang", @@ -270,7 +270,7 @@ "Package": "rprojroot", 
"Version": "2.0.4", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R" ], @@ -322,7 +322,7 @@ }, "withr": { "Package": "withr", - "Version": "3.0.1", + "Version": "3.0.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -330,7 +330,7 @@ "grDevices", "graphics" ], - "Hash": "07909200e8bbe90426fbfeb73e1e27aa" + "Hash": "cc2d62c76458d425210d1eb1478b30b4" } } } diff --git a/renv/activate.R b/renv/activate.R index d13f9932..0eb51088 100644 --- a/renv/activate.R +++ b/renv/activate.R @@ -2,7 +2,7 @@ local({ # the requested version of renv - version <- "1.0.7" + version <- "1.0.11" attr(version, "sha") <- NULL # the project directory @@ -98,6 +98,66 @@ local({ unloadNamespace("renv") # load bootstrap tools + ansify <- function(text) { + if (renv_ansify_enabled()) + renv_ansify_enhanced(text) + else + renv_ansify_default(text) + } + + renv_ansify_enabled <- function() { + + override <- Sys.getenv("RENV_ANSIFY_ENABLED", unset = NA) + if (!is.na(override)) + return(as.logical(override)) + + pane <- Sys.getenv("RSTUDIO_CHILD_PROCESS_PANE", unset = NA) + if (identical(pane, "build")) + return(FALSE) + + testthat <- Sys.getenv("TESTTHAT", unset = "false") + if (tolower(testthat) %in% "true") + return(FALSE) + + iderun <- Sys.getenv("R_CLI_HAS_HYPERLINK_IDE_RUN", unset = "false") + if (tolower(iderun) %in% "false") + return(FALSE) + + TRUE + + } + + renv_ansify_default <- function(text) { + text + } + + renv_ansify_enhanced <- function(text) { + + # R help links + pattern <- "`\\?(renv::(?:[^`])+)`" + replacement <- "`\033]8;;ide:help:\\1\a?\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # runnable code + pattern <- "`(renv::(?:[^`])+)`" + replacement <- "`\033]8;;ide:run:\\1\a\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # return ansified text + text + + } + + renv_ansify_init <- function() { + + envir <- renv_envir_self() + if (renv_ansify_enabled()) + 
assign("ansify", renv_ansify_enhanced, envir = envir) + else + assign("ansify", renv_ansify_default, envir = envir) + + } + `%||%` <- function(x, y) { if (is.null(x)) y else x } @@ -142,7 +202,10 @@ local({ # compute common indent indent <- regexpr("[^[:space:]]", lines) common <- min(setdiff(indent, -1L)) - leave - paste(substring(lines, common), collapse = "\n") + text <- paste(substring(lines, common), collapse = "\n") + + # substitute in ANSI links for executable renv code + ansify(text) } @@ -305,8 +368,11 @@ local({ quiet = TRUE ) - if ("headers" %in% names(formals(utils::download.file))) - args$headers <- renv_bootstrap_download_custom_headers(url) + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(url) + if (length(headers) && is.character(headers)) + args$headers <- headers + } do.call(utils::download.file, args) @@ -385,10 +451,21 @@ local({ for (type in types) { for (repos in renv_bootstrap_repos()) { + # build arguments for utils::available.packages() call + args <- list(type = type, repos = repos) + + # add custom headers if available -- note that + # utils::available.packages() will pass this to download.file() + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(repos) + if (length(headers) && is.character(headers)) + args$headers <- headers + } + # retrieve package database db <- tryCatch( as.data.frame( - utils::available.packages(type = type, repos = repos), + do.call(utils::available.packages, args), stringsAsFactors = FALSE ), error = identity @@ -470,6 +547,14 @@ local({ } + renv_bootstrap_github_token <- function() { + for (envvar in c("GITHUB_TOKEN", "GITHUB_PAT", "GH_TOKEN")) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(envval) + } + } + renv_bootstrap_download_github <- function(version) { enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") @@ -477,16 +562,16 @@ local({ 
return(FALSE) # prepare download options - pat <- Sys.getenv("GITHUB_PAT") - if (nzchar(Sys.which("curl")) && nzchar(pat)) { + token <- renv_bootstrap_github_token() + if (nzchar(Sys.which("curl")) && nzchar(token)) { fmt <- "--location --fail --header \"Authorization: token %s\"" - extra <- sprintf(fmt, pat) + extra <- sprintf(fmt, token) saved <- options("download.file.method", "download.file.extra") options(download.file.method = "curl", download.file.extra = extra) on.exit(do.call(base::options, saved), add = TRUE) - } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { + } else if (nzchar(Sys.which("wget")) && nzchar(token)) { fmt <- "--header=\"Authorization: token %s\"" - extra <- sprintf(fmt, pat) + extra <- sprintf(fmt, token) saved <- options("download.file.method", "download.file.extra") options(download.file.method = "wget", download.file.extra = extra) on.exit(do.call(base::options, saved), add = TRUE) diff --git a/requirements.txt b/requirements.txt index 49d285fb..e6b8dfe6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -accelerate==0.28.0 +accelerate==1.2.0 beautifulsoup4==4.12.3 boto3==1.35.23 botocore==1.35.44 @@ -18,26 +18,27 @@ ftfy==6.1.3 future==0.18.3 gdown==5.2.0 gensim==4.3.3 -huggingface-hub==0.26.0 +huggingface-hub==0.26.2 hyperopt==0.2.7 -idna==3.7 +idna==3.10 imageio==2.35.1 Janome==0.5.0 -Jinja2==3.1.4 +Jinja2==3.1.5 jmespath==1.0.1 joblib==1.3.2 kiwisolver==1.4.7 langdetect==1.0.9 -lazy_loader==0.3 +lazy_loader==0.4 lime==0.2.0.1 lxml==5.3.0 -MarkupSafe==2.1.3 -matplotlib==3.9.2 +lxml==4.9.3 +MarkupSafe==3.0.2 +matplotlib==3.10.0 more-itertools==10.5.0 mpld3==0.5.10 mpmath==1.3.0 -networkx==3.2.1 -numpy==1.26.2 +networkx==3.4.2 +numpy==2.2.0 packaging==24.1 pandas==2.2.3 Pillow==10.2.0 @@ -63,15 +64,15 @@ semver==3.0.2 sentencepiece==0.1.99 six==1.16.0 smart-open==7.0.4 -soupsieve==2.5 +soupsieve==2.6 sqlitedict==2.1.0 sympy==1.13.3 tabulate==0.9.0 threadpoolctl==3.2.0 tifffile==2024.9.20 -tokenizers==0.20.1 
-torch==2.5.0 -torchvision==0.20.0 +tokenizers==0.21.0 +torch==2.5.1 +torchvision==0.20.1 tqdm==4.66.5 transformer-smaller-training-vocab==0.4.0 transformers==4.41.1 @@ -79,5 +80,5 @@ typing_extensions==4.9.0 tzdata==2024.2 urllib3==2.2.2 wcwidth==0.2.13 -Wikipedia-API==0.6.0 +Wikipedia-API==0.7.1 wrapt==1.16.0 diff --git a/tests/testthat/test_get_entities.R b/tests/testthat/test_get_entities.R index 249d252d..9250da46 100644 --- a/tests/testthat/test_get_entities.R +++ b/tests/testthat/test_get_entities.R @@ -1,100 +1,88 @@ -# Test 1: get_entities returns four entities for two input texts using "ner" -test_that("get_entities returns four entities for two input texts using 'ner'", { - result <- get_entities( - texts = c("UCD is one of the best university in Ireland. ", - "TCD in less better than Oxford"), - doc_ids = c("doc1", "doc2"), - language = "ner" +test_that("get_entities works with standard NER model", { + + texts <- c( + "John Smith works at Google in New York.", + "The Eiffel Tower was built in 1889." ) - # Check that the number of rows in the result matches the expected number of entities - expect_equal(nrow(result), 4) -}) + doc_ids <- c("doc1", "doc2") -# Test 2: get_entities_batch returns four entities for two input texts. -test_that("get_entities_batch returns four entities for two input texts using 'ner'", { - result <- get_entities_batch( - texts = c("UCD is one of the best university in Ireland. 
", - "TCD in less better than Oxford"), - doc_ids = c("doc1", "doc2"), - language = "ner", - batch_size = 5, - device = "cpu" + # 加載標準 NER 模型 + tagger_std <- load_tagger_ner('ner') + + # 基本功能測試 + results <- get_entities( + texts = texts, + doc_ids = doc_ids, + tagger = tagger_std, + batch_size = 2 ) - # Check that the number of rows in the result matches the expected number of entities - expect_equal(nrow(result), 4) + + # 測試返回值結構 + expect_true(is.data.frame(results)) + + # 測試提取的實體 + expect_true(any(grepl("John Smith", results$entity))) + expect_true(any(grepl("Google", results$entity))) + expect_true(any(grepl("New York", results$entity))) + + expected_tags <- c("PER", "ORG", "LOC") + expect_true(all(results$tag[results$tag != "O"] %in% expected_tags)) }) +test_that("get_entities handles different parameters correctly", { -# Test 3: get_entities throws an error for mismatched lengths of texts and doc_ids -test_that("get_entities throws an error for mismatched lengths of texts and doc_ids", { - expect_error( - get_entities( - texts = "TCD in less better than Oxford", - doc_ids = c("doc1", "doc2"), - language = "ner" - ), - "The lengths of texts and doc_ids do not match." + text <- "John lives in Berlin." 
+ tagger_std <- load_tagger_ner('ner') + + result_with_text <- get_entities( + texts = text, + doc_ids = 1, + tagger = tagger_std, + show.text_id = TRUE ) -}) + expect_true("text_id" %in% names(result_with_text)) -# Test 4: get_entities returns NA for the "tag" field when there are mismatched -# lengths of texts and doc_ids -test_that("get_entities returns NA for the 'tag' field.", { - result <- get_entities( - texts = "TCD in less better than Oxford", - doc_ids = "doc1", - language = "ner" + # 測試批次大小 + result_batch <- get_entities( + texts = rep(text, 3), + doc_ids = c(1, 2, 3), + tagger = tagger_std, + batch_size = 2 ) - # Check that the "tag" field is NA - expect_true(is.na(result[3, "tag"]$tag)) -}) + expect_true(nrow(result_batch) >= 2) -# Test 5: get_entities returns NA for the "tag" field when the input text -# does not contain entities -test_that("get_entities returns NA for the 'tag' field", { - result <- get_entities( - texts = "xxxxxxxxx", + result_with_ids <- get_entities( + texts = text, doc_ids = "doc1", - language = "ner" + tagger = tagger_std ) - # Check that the "tag" field is NA - expect_true(is.na(result[1, "tag"]$tag)) + expect_equal(unique(result_with_ids$doc_id), "doc1") }) -# Test 6: get_entities returns NA for the "tag" field when the input text is NA -test_that("get_entities returns NA for the 'tag' field.", { - result <- get_entities( - texts = NA, - doc_ids = NA, - language = "ner" +test_that("get_entities error handling", { + tagger_std <- load_tagger_ner('ner') + + expect_error( + get_entities(texts = character(0), tagger = tagger_std), + "The texts cannot be NULL or empty." 
) - # Check that the "tag" field is NA - expect_true(is.na(result[1, "tag"]$tag)) -}) -# Test 7: get_entities returns NA for the "tag" field when the input text is -# NA and show.text_id is TRUE -test_that("get_entities returns NA for the 'tag' field", { - result <- get_entities( - texts = NA, - doc_ids = NA, - show.text_id = TRUE, - gc.active = TRUE, - language = "ner" + # 測試 texts 和 doc_ids 長度不匹配 + expect_error( + get_entities( + texts = c("text1", "text2"), + doc_ids = "doc1", + tagger = tagger_std + ), + "The lengths of texts and doc_ids do not match." ) - # Check that the "tag" field is NA - expect_true(is.na(result[1, "tag"]$tag)) -}) -# Test 8: get_entities returns the correct entity tag "ORG" for an input text -test_that("get_entities returns the correct entity tag 'ORG' for an input text", { - result <- get_entities( - texts = "TCD in less better than Oxford", - doc_ids = "doc1", - show.text_id = TRUE, - gc.active = TRUE, - language = "ner" + expect_error( + get_entities( + texts = "text", + tagger = tagger_std, + batch_size = 0 + ), + "Invalid batch size. It must be a positive integer." 
) - # Check that the entity tag is "ORG" - expect_equal(result[1, "tag"]$tag, "ORG") }) diff --git a/tests/testthat/test_get_entities_batch.R b/tests/testthat/test_get_entities_batch.R deleted file mode 100644 index 001eda32..00000000 --- a/tests/testthat/test_get_entities_batch.R +++ /dev/null @@ -1,51 +0,0 @@ - -test_that("batch_texts and batch_doc_ids lengths mismatch", { - expect_error(get_entities_batch(texts = c("text1", "text2"), - doc_ids = "id1", - show.text_id = FALSE), - "The lengths of texts and doc_ids do not match.") -}) - -test_that("NA values for text or doc_id", { - result <- get_entities_batch(texts = c("text1", NA), - doc_ids = c("id1", "id2"), - show.text_id = FALSE) - expect_equal(typeof(result$doc_id[2]), "character") - expect_equal(result$entity[2], NA) - expect_equal(result$tag[2], NA) -}) - -test_that("No entities detected", { - # Assuming that the tagger returns no entities for "text_without_entity" - result <- get_entities_batch(texts = "text_without_entity", - doc_ids = "id1", - show.text_id = FALSE) - expect_equal(result$doc_id[1], "id1") - expect_equal(result$entity[1], NA) - expect_equal(result$tag[1], NA) -}) - -test_that("Inclusion of doc_id when show.text_id is TRUE", { - result <- get_entities_batch(texts = "text1", - doc_ids = "id1", - show.text_id = TRUE) - expect_true("text_id" %in% colnames(result)) - expect_equal(result$text_id[1], "text1") -}) - - -test_that("Mismatched lengths of batch_texts and batch_doc_ids raise an error", { - expect_error(get_entities_batch(c("Hello", "World"), "doc1", show.text_id = TRUE), - "The lengths of texts and doc_ids do not match.") -}) - -test_that("show.text_id = TRUE adds a text_id column", { - result <- get_entities_batch(c("Hello", "World"), c("doc1", "doc2"), show.text_id = TRUE) - expect_true("text_id" %in% names(result)) -}) - -test_that("show.text_id = FALSE does not add a text_id column", { - result <- get_entities_batch(c("Hello", "World"), c("doc1", "doc2"), show.text_id = TRUE) - 
expect_true("text_id" %in% colnames(result)) -}) - diff --git a/tests/testthat/test_get_pos.R b/tests/testthat/test_get_pos.R index 222158c3..8ba967a7 100644 --- a/tests/testthat/test_get_pos.R +++ b/tests/testthat/test_get_pos.R @@ -92,18 +92,19 @@ test_that("get_pos_batch throws an error for no valid texts provided.", { }) # Test 8: test loading tagger works as expected - test_that("loading tagger works as expected", { - # Assuming you have a valid tagger object for English - valid_tagger <- load_tagger_pos("pos-fast") + expected_messages <- c( + "Model name not specified. Using default 'pos-fast' model.", + "Loading POS tagger model: pos-fast" + ) - # tagger is NULL and no language is specified - expect_message(get_pos("Hello World", "doc1"), "Language is not specified. pos-fastin Flair is forceloaded. Please ensure that the internet connectivity is stable.") + messages <- capture_messages(get_pos("Hello World", "doc1")) - # tagger is NULL but a language is specified - expect_silent(get_pos("Hello World", "doc1", language = "pos")) - # - # a valid tagger object is passed - expect_silent(get_pos("Hello World", "doc1", tagger = valid_tagger)) + # 檢查每個期望的訊息是否都存在 + for(expected in expected_messages) { + expect_true( + any(grepl(expected, messages, fixed = TRUE)), + info = paste("Expected message not found:", expected) + ) + } }) - diff --git a/tests/testthat/test_get_pos_batch.R b/tests/testthat/test_get_pos_batch.R deleted file mode 100644 index 34d78eec..00000000 --- a/tests/testthat/test_get_pos_batch.R +++ /dev/null @@ -1,43 +0,0 @@ -# Test 1: get_pos_batch returns pos tags for three input texts using a custom tagger -test_that("get_pos_batch returns pos tags for three input texts using a custom tagger", { - result <- get_pos_batch( - texts = c("UCD is one of the best universities in Ireland.", - "Essex is famous in social science research", - "TCD is the oldest one in Ireland."), - doc_ids = c("doc1", "doc2", "doc3"), - tagger = load_tagger_pos(), - language = 
"upos-fast", - batch_size = 1 - ) - # Check that the number of rows in the result matches the number of tokens - expect_equal(nrow(result), 25) -}) - -# Test 2: get_pos_batch returns pos tags for three input texts using a custom tagger -test_that("texts and doc_ids are 0 and get_pos_batch returns NA", { - result <- get_pos_batch( - texts = "", - doc_ids = "", - tagger = load_tagger_pos(), - language = "upos-fast" , - batch_size = 1) - # Check that the number of rows in the result matches the number of tokens - expect_equal(nrow(result), 1) -}) - - -# Test 2: test loading tagger in get_pos_batch works as expected - -test_that("loading tagger works as expected", { - # Assuming you have a valid tagger object for English - valid_tagger <- load_tagger_pos("pos-fast") - - # Test 1: tagger is NULL and no language is specified - expect_message(get_pos_batch("Hello Dublin", batch_size = 1, doc_ids = "doc1"), "Language is not specified. pos-fastin Flair is forceloaded. Please ensure that the internet connectivity is stable.", fixed = FALSE) - - # Test 2: tagger is NULL but a language is specified - expect_equal(get_pos_batch(texts = "Hello Ireland", batch_size = 1, doc_ids = "doc1", language = "pos-fast")[1, "tag"]$tag, "UH") - # - # Test 3: a valid tagger object is passed - expect_message(get_pos_batch("Hello Colchester", batch_size = 1, doc_ids = "doc1", tagger = valid_tagger), "CPU is used.") -}) diff --git a/tests/testthat/test_get_sentiments.R b/tests/testthat/test_get_sentiments.R deleted file mode 100644 index 6a11cb77..00000000 --- a/tests/testthat/test_get_sentiments.R +++ /dev/null @@ -1,91 +0,0 @@ -# Test 1: get_sentiments returns sentiment scores for two input texts -test_that("get_sentiments returns sentiment scores for two input texts", { - result <- get_sentiments( - texts = c("UCD is one of the best universities in Ireland.", - "TCD is less better than Oxford."), - doc_ids = c("doc1", "doc2"), - language = "sentiment" - ) - # Check that the number of rows in 
the result matches the number of texts - expect_equal(nrow(result), 2) -}) - - -# Test 2: get_sentiments returns sentiment scores for two input texts -test_that("get_sentiments returns sentiment scores for two input texts", { - result <- get_sentiments_batch( - texts = c("UCD is one of the best universities in Ireland.", - "TCD is less better than Oxford."), - doc_ids = c("doc1", "doc2"), - language = "sentiment" - ) - # Check that the number of rows in the result matches the number of texts - expect_equal(nrow(result), 2) -}) - -# Test 3: get_sentiments returns sentiment scores using a custom tagger -test_that("get_sentiments returns sentiment scores using a custom tagger", { - result <- get_sentiments( - texts = c("UCD is one of the best universities in Ireland.", - "TCD is less better than Oxford."), - doc_ids = c("doc1", "doc2"), - tagger = load_tagger_sentiments(), - language = "sentiment" - ) - # Check that the number of rows in the result matches the number of texts - expect_equal(nrow(result), 2) -}) - -# Test 4: get_sentiments throws an error for mismatched lengths of texts and doc_ids -test_that("get_sentiments throws an error for mismatched lengths of texts and doc_ids", { - expect_error( - get_sentiments( - texts = "TCD in less better than Oxford", - doc_ids = c("doc1", "doc2"), - language = "sentiment" - ), - "The lengths of texts and doc_ids do not match." - ) -}) - -# Test 5: get_sentiments handles NA values and returns NA for sentiment scores -test_that("get_sentiments handles NA values and returns NA for sentiment scores", { - result <- get_sentiments( - texts = NA, - doc_ids = NA, - show.text_id = TRUE, - gc.active = TRUE, - language = "sentiment" - ) - # Check that the sentiment score is NA - expect_true(is.na(result[1, "score"]$score)) -}) - - -# Test 6: get_sentiments returns "The lengths of texts and doc_ids do not match. 
-test_that("get_sentiments returns The lengths of texts and doc_ids do not match.", { - expect_error(get_sentiments( - texts = "TCD in less better than Oxford", "Essex is in Colchester", - doc_ids = c("doc1", "doc2"), - show.text_id = TRUE, - gc.active = TRUE, - language = "sentiment" - ) - , "The lengths of texts and doc_ids do not match.") -}) - -# Test 7: get_sentiments with empty input returns NA for score -test_that("get_sentiments with empty input returns NA for score", { - # Call get_sentiments with empty input - result <- get_sentiments( - texts = "", - doc_ids = "", - show.text_id = TRUE, - gc.active = TRUE, - language = "sentiment" - ) - # Check that the result has one row and the "score" column is NA - expect_equal(nrow(result), 1) - expect_true(is.na(result$score)) -}) - diff --git a/tests/testthat/test_get_sentiments_batch.R b/tests/testthat/test_get_sentiments_batch.R deleted file mode 100644 index cef8fa5c..00000000 --- a/tests/testthat/test_get_sentiments_batch.R +++ /dev/null @@ -1,37 +0,0 @@ -# Test 1: Ensure the length of texts and doc_ids are the same. - -test_that("Ensure the length of texts and doc_ids are the same", { - expect_error(get_sentiments_batch(texts = c("text1", "text2"), - doc_ids = "id1"), - "The lengths of texts and doc_ids do not match.") -}) - -# Test 2: Check if texts is empty after removing NAs and empty texts -test_that("Check if texts is empty after removing NAs and empty texts", { - expect_message(get_sentiments_batch(texts = c("", NA, NA), - doc_ids = c("id1", "id2", "id3")), - "CPU is used.", fixed = FALSE) -}) - -# Test 3: Check sentiment and score for specific input text - -test_that("Check sentiment and score for specific input text", { - result <- get_sentiments_batch(texts = "some_text_without_sentiment", - doc_ids = "id1") - - expect_equal(result$sentiment[1], "NEGATIVE") - expect_equal(result$score[1], 0.9968621, tolerance = 0.0001) -}) - -# Test 4. 
text_id` is added to the result only if `show.text_id` is TRUE -test_that("`text_id` is added to the result only if `show.text_id` is TRUE", { - result_with_text_id <- get_sentiments_batch(texts = "some_text", - doc_ids = "id1", - show.text_id = TRUE) - expect_true("text_id" %in% names(result_with_text_id)) - - result_without_text_id <- get_sentiments_batch(texts = "some_text", - doc_ids = "id1", - show.text_id = FALSE) - expect_false("text_id" %in% names(result_without_text_id)) -}) diff --git a/tests/testthat/test_util.R b/tests/testthat/test_util.R index 53855e95..1c24ec3a 100644 --- a/tests/testthat/test_util.R +++ b/tests/testthat/test_util.R @@ -1,13 +1,13 @@ # Test 1: Ensure that `get_entities` provides the expected row count when # provided with specific texts, doc_ids, and a pre-loaded tagger. -test_that("get_entities throws an error for unsupported languages under no internet condition", { - expect_equal(nrow(get_entities( - texts = c("UCD is one of the best university in Ireland. ", - "TCD in less better than Oxford"), - doc_ids= c("doc1", "doc2"), load_tagger_ner("en"))), - 4) -}) - +# test_that("get_entities throws an error for unsupported languages under no internet condition", { +# expect_equal(nrow(get_entities( +# texts = c("UCD is one of the best university in Ireland. ", +# "TCD in less better than Oxford"), +# doc_ids= c("doc1", "doc2"), load_tagger_ner("en"))), +# 4) +# }) +# # Test 2: Similar test as above, but without explicitly specifying the language # for the tagger. This tests the default behavior. @@ -115,20 +115,3 @@ test_that("check_texts_and_ids handles input correctly", { expect_equal(res$doc_ids, c("id1", "id2")) }) - -# Test 14: check_flair_installed identifies whether flair is available - -test_that("check_flair_installed identifies whether flair is available", { - # Mocking that the module is available - with_mock( - `reticulate::py_module_available` = function(...) 
TRUE, - expect_true(check_flair_installed()) - ) - - # Mocking that the module is not available - with_mock( - `reticulate::py_module_available` = function(...) FALSE, - expect_false(check_flair_installed()) - ) -}) - diff --git a/vignettes/get_entities.Rmd b/vignettes/get_entities.Rmd index 6a4a7745..45d42b53 100644 --- a/vignettes/get_entities.Rmd +++ b/vignettes/get_entities.Rmd @@ -23,7 +23,6 @@ system(paste(Sys.which("python3"), "-m pip install --upgrade pip")) system(paste(Sys.which("python3"), "-m pip install torch")) system(paste(Sys.which("python3"), "-m pip install flair")) library(reticulate) -library(flaiR) # # system(paste(reticulate::py_config()$python, "-m pip install flair")) # reticulate::py_install("flair") ``` @@ -101,28 +100,3 @@ print(results) ```{r} print(results) ``` - -## Batch Processing - -
-Processing texts individually can be both inefficient and memory-intensive. On the other hand, processing all the texts simultaneously could surpass memory constraints, especially if each document in the dataset is sizable. Parsing the documents in smaller batches may provide an optimal compromise between these two scenarios. Batch processing can enhance efficiency and aid in memory management. - -By default, the batch_size parameter is set to 5. You can consider starting with this default value and then experimenting with different batch sizes to find the one that works best for your specific use case. You can monitor memory usage and processing time to help you make a decision. If you have access to a GPU, you might also try larger batch sizes to take advantage of GPU parallelism. However, be cautious not to set the batch size too large, as it can lead to out-of-memory errors. Ultimately, the choice of batch size should be based on a balance between memory constraints, processing efficiency, and the specific requirements of your entity extraction task. - -
- -```{r} -batch_process_time <- system.time({ - batch_process_results <- get_entities_batch(uk_immigration$text, - uk_immigration$speaker, - tagger_ner, - show.text_id = FALSE, - batch_size = 5) - gc() -}) -print(batch_process_time) -``` - -```{r} -print(batch_process_results) -``` diff --git a/vignettes/get_pos.Rmd b/vignettes/get_pos.Rmd index 752cf18d..0f023731 100644 --- a/vignettes/get_pos.Rmd +++ b/vignettes/get_pos.Rmd @@ -86,23 +86,3 @@ print(results) ```
-## Batch Processing -
- -By default, the batch_size parameter is set to 5. You can consider starting with this default value and then experimenting with different batch sizes to find the one that works best for your specific use case. You can monitor memory usage and processing time to help you make a decision. If you have access to a GPU, you might also try larger batch sizes to take advantage of GPU parallelism. However, be cautious not to set the batch size too large, as it can lead to out-of-memory errors. Ultimately, the choice of batch size should be based on a balance between memory constraints, processing efficiency, and the specific requirements of your entity extraction task. - -
- -```{r} -batch_process_results <- get_pos_batch(uk_immigration$text, - uk_immigration$speaker, - tagger_pos, - show.text_id = FALSE, - batch_size = 10, - verbose = TRUE) -print(batch_process_results) -``` - - - - diff --git a/vignettes/get_sentiments.Rmd b/vignettes/get_sentiments.Rmd deleted file mode 100644 index 6e74ef53..00000000 --- a/vignettes/get_sentiments.Rmd +++ /dev/null @@ -1,122 +0,0 @@ ---- -title: "Tagging Sentiment with Flair Standard Models" -author: - - name: "David (Yen-Chieh) Liao" - affiliation: "Postdoc at Text & Policy Research Group and SPIRe in UCD" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Tagging Sentiment with Flair Standard Models} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -```{r, include = FALSE} -system(paste(Sys.which("python3"), "-m pip install --upgrade pip")) -system(paste(Sys.which("python3"), "-m pip install torch")) -system(paste(Sys.which("python3"), "-m pip install flair")) -library(reticulate) -library(flaiR) -# reticulate::py_install("flair") -``` - - -## An Example Using `sentiment` Model (Pre-trained English Model) -```{r} -library(flaiR) -data("uk_immigration") -uk_immigration <- head(uk_immigration, 5) -``` - -
- -Download the English sentiment model from FlairNLP on [Hugging Face](https://huggingface.co/flair). Currently, it also supports a large English sentiment model and a German pre-trained model. - -```{r} -tagger_sent <- load_tagger_sentiments("sentiment") -``` - -Flair NLP operates under the [PyTorch](https://pytorch.org) framework. As such, we can use the `$to` method to set the device for the Flair Python library. The flair_device("cpu") allows you to select whether to use the CPU, CUDA devices (like cuda:0, cuda:1, cuda:2), or specific MPS devices on Mac (such as mps:0, mps:1, mps:2). For information on Accelerated PyTorch training on Mac, please refer to https://developer.apple.com/metal/pytorch/. For more about CUDA, please visit: https://developer.nvidia.com/cuda-zone - -```{r eval=FALSE, include=TRUE} -tagger_sent$to(flair_device("mps")) -``` - -``` -TextClassifier( - (embeddings): TransformerDocumentEmbeddings( - (model): DistilBertModel( - (embeddings): Embeddings( - (word_embeddings): Embedding(30522, 768, padding_idx=0) - (position_embeddings): Embedding(512, 768) - (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) - (dropout): Dropout(p=0.1, inplace=False) - ) - (transformer): Transformer( - (layer): ModuleList( - (0-5): 6 x TransformerBlock( - (attention): MultiHeadSelfAttention( - (dropout): Dropout(p=0.1, inplace=False) - (q_lin): Linear(in_features=768, out_features=768, bias=True) - (k_lin): Linear(in_features=768, out_features=768, bias=True) - (v_lin): Linear(in_features=768, out_features=768, bias=True) - (out_lin): Linear(in_features=768, out_features=768, bias=True) - ) - (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) - (ffn): FFN( - (dropout): Dropout(p=0.1, inplace=False) - (lin1): Linear(in_features=768, out_features=3072, bias=True) - (lin2): Linear(in_features=3072, out_features=768, bias=True) - (activation): GELUActivation() - ) - (output_layer_norm): LayerNorm((768,), eps=1e-12, 
elementwise_affine=True) - ) - ) - ) - ) - ) - (decoder): Linear(in_features=768, out_features=2, bias=True) - (dropout): Dropout(p=0.0, inplace=False) - (locked_dropout): LockedDropout(p=0.0) - (word_dropout): WordDropout(p=0.0) - (loss_function): CrossEntropyLoss() -) -``` -```{r } -results <- get_sentiments(uk_immigration$text, seq_len(nrow(uk_immigration)), - tagger_sent) -``` - -```{r} -print(results) -``` - -
- -## Batch Processing in English Sentiment Model -
- -Processing texts individually can be both inefficient and memory-intensive. On the other hand, processing all the texts simultaneously could surpass memory constraints, especially if each document in the dataset is sizable. Parsing the documents in smaller batches may provide an optimal compromise between these two scenarios. Batch processing can enhance efficiency and aid in memory management. - -By default, the batch_size parameter is set to 5. You can consider starting with this default value and then experimenting with different batch sizes to find the one that works best for your specific use case. You can monitor memory usage and processing time to help you make a decision. If you have access to a GPU, you might also try larger batch sizes to take advantage of GPU parallelism. However, be cautious not to set the batch size too large, as it can lead to out-of-memory errors. Ultimately, the choice of batch size should be based on a balance between memory constraints, processing efficiency, and the specific requirements of your entity extraction task. - -
-```{r} -batch_process_results <- get_sentiments_batch(uk_immigration$text, - uk_immigration$speaker, - tagger_sent, - show.text_id = FALSE, - batch_size = 2, - verbose = TRUE) -``` - -```{r} -print(batch_process_results) -``` - diff --git a/vignettes/tutorial.Rmd b/vignettes/tutorial.Rmd index 6fc47791..611a9b16 100644 --- a/vignettes/tutorial.Rmd +++ b/vignettes/tutorial.Rmd @@ -1,7 +1,7 @@ --- title: "Tutorial" author: - - name: "Yen-Chieh (David) Liao | Sohini Timbadia | Stefan Müller" + - name: "Yen-Chieh Liao | Sohini Timbadia | Stefan Müller" affiliation: "University of Birmingham & University College Dublin" output: rmarkdown::html_vignette vignette: > @@ -29,15 +29,20 @@ library(reticulate) reticulate::py_install("flair") ``` -# The Overview + +# Flair NLP and flaiR for Social Science
-**Flair NLP** is an open-source library for Natural Language Processing (NLP) developed by [Zalando Research](https://github.com/zalandoresearch/). Known for its state-of-the-art solutions, such as contextual string embeddings for NLP tasks like Named Entity Recognition (NER), Part-of-Speech tagging (POS), and more, it has garnered the attention of the NLP community for its ease of use and powerful functionalities.
+Flair NLP is an open-source Natural Language Processing (NLP) library developed by [Zalando Research](https://github.com/zalandoresearch/). Known for its state-of-the-art solutions, it excels in contextual string embeddings, Named Entity Recognition (NER), and Part-of-Speech tagging (POS). Flair offers robust text analysis tools through multiple embedding approaches, including Flair contextual string embeddings, transformer-based embeddings from Hugging Face, and traditional models like GloVe and fasttext. Additionally, it provides pre-trained models for various languages and seamless integration with fine-tuned transformers hosted on Hugging Face.
+
+flaiR bridges these powerful NLP features from Python to the R environment, making advanced text analysis accessible for social science researchers by combining Flair's ease of use with R's familiar interface for integration with popular R packages such as [quanteda](https://quanteda.io) and more.
+
+
-In addition, Flair NLP offers pre-trained models for various languages and tasks, and is compatible with fine-tuned transformers hosted on Hugging Face. +# The Overview -- [**Sentence and Token Object**](#tutorial.html#sentence-and-token) +- [**Sentence and Token Object in FlaiR**](#tutorial.html#sentence-and-token) - [**Sequence Taggings**](#tutorial.html#sequence-taggings) @@ -51,21 +56,25 @@ In addition, Flair NLP offers pre-trained models for various languages and tasks - [**Finetune BERT**](#tutorial.html#finetune-transformers) - +- [**Extending conText's Embedding Regression**](#tutorial.html#extending-contexts-embedding-regression) + +  + +----- ------------------------------------------------------------------------- # Sentence and Token Sentence and Token are fundamental classes. -## **Sentence** +## Sentence
A Sentence in Flair is an object that contains a sequence of Token objects, and it can be annotated with labels, such as named entities, part-of-speech tags, and more. It also can store embeddings for the sentence as a whole and different kinds of linguistic annotations. Here's a simple example of how you create a Sentence: +
```{r} # Creating a Sentence object @@ -81,9 +90,8 @@ sentence <- Sentence(string) print(sentence) ``` - -## **Token** +## Token
@@ -91,6 +99,8 @@ When you use Flair to handle text data,[^1] `Sentence` and `Token` objects often Unlike R, which indexes from 1, Python indexes from 0. Therefore, when using a for loop, I use `seq_along(sentence) - 1`. The output should be something like: +
+ ```{r} # The Sentence object has automatically created and contains multiple Token objects # We can iterate through the Sentence object to view each Token @@ -109,8 +119,13 @@ print(sentence$tokens) **Retrieve the Token** +
+ To comprehend the string representation format of the Sentence object, tagging at least one token is adequate. Python's `get_token(n)` method allows us to retrieve the Token object for a particular token. Additionally, we can use **`[]`** to index a specific token. +
+ + ```{r} # method in Python sentence$get_token(5) @@ -121,16 +136,24 @@ sentence$get_token(5) sentence[6] ``` -Each word (and punctuation) in the text is treated as an individual Token object. These Token objects store text information and other possible linguistic information (such as part-of-speech tags or named entity tags) and embeddings (if you used a model to generate them). +
+ +Each word (and punctuation) in the text is treated as an individual Token object. These Token objects store text information and other possible linguistic information (such as part-of-speech tags or named entity tags) and embedding (if you used a model to generate them). While you do not need to create Token objects manually, understanding how to manage them is useful in situations where you might want to fine-tune the tokenization process. For example, you can control the exactness of tokenization by manually creating Token objects from a Sentence object. This makes Flair very flexible when handling text data since the automatic tokenization feature can be used for rapid development, while also allowing users to fine-tune their tokenization. +
+ **Annotate POS tag and NER tag** +
+ The `add_label(label_type, value)` method can be employed to assign a label to the token. In Universal POS tags, if `sentence[10]` is 'see', 'seen' might be tagged as `VERB`, indicating it is a past participle form of a verb. +
+ ```{r} sentence[10]$add_label('manual-pos', 'VERB') ``` @@ -139,8 +162,12 @@ sentence[10]$add_label('manual-pos', 'VERB') print(sentence[10]) ``` +
+ We can also add a NER (Named Entity Recognition) tag to `sentence[4]`, "UCD", identifying it as a university in Dublin. +
+ ```{r} sentence[4]$add_label('ner', 'ORG') ``` @@ -149,19 +176,20 @@ sentence[4]$add_label('ner', 'ORG') print(sentence[4]) ``` +
+ If we print the sentence object, `Sentence[50]` provides information for 50 tokens → ['in'/ORG, 'seen'/VERB], thus displaying two tagging pieces of information. +
+ ```{r} print(sentence) ``` [^1]: Flair is built on PyTorch, which is a library in Python. - - -## **Corpus** -
+## Corpus The Corpus object in Flair is a fundamental data structure that represents a dataset containing text samples, usually comprising of a training set, a development set (or validation set), and a test set. It's designed to work smoothly with Flair's models for tasks like named entity recognition, text classification, and more. @@ -173,9 +201,9 @@ The Corpus object in Flair is a fundamental data structure that represents a dat **Important Methods:** -- `downsample`: This method allows you to downsample (reduce) the number of sentences in the train, dev, and test splits. -- `obtain_statistics`: This method gives a quick overview of the statistics of the corpus, including the number of sentences and the distribution of labels. -- `make_vocab_dictionary`: Used to create a vocabulary dictionary from the corpus. +- `downsample`: This method allows you to downsample (reduce) the number of sentences in the train, dev, and test splits. +- `obtain_statistics`: This method gives a quick overview of the statistics of the corpus, including the number of sentences and the distribution of labels. +- `make_vocab_dictionary`: Used to create a vocabulary dictionary from the corpus. ```{r} library(flaiR) @@ -193,8 +221,12 @@ test <- list(Sentence('This is a test example.')) corp <- Corpus(train = train, dev = dev, test = test) ``` +
+ `$obtain_statistics()` method of the Corpus object in the Flair library provides an overview of the dataset statistics. The method returns a [Python dictionary](https://www.w3schools.com/python/python_dictionaries.asp) with details about the training, validation (development), and test datasets that make up the corpus. In R, you can use the jsonlite package to format JSON. +
+ ```{r} library(jsonlite) data <- fromJSON(corp$obtain_statistics()) @@ -204,10 +236,14 @@ print(formatted_str) **In R** +
+ Below, we use data from the article [*The Temporal Focus of Campaign Communication*](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwjz_bS3p5KCAxWEWEEAHcuVAi4QFnoECA8QAQ&url=https%3A%2F%2Fwww.journals.uchicago.edu%2Fdoi%2Ffull%2F10.1086%2F715165&usg=AOvVaw3f_J3sXTrym2ZR64pF3ZtN&opi=89978449) by [Stefan Muller](https://muellerstefan.net), published in the *Journal of Politics* in 2020, as an example. First, we vectorize the `cc_muller$text` using the Sentence function to transform it into a list object. Then, we reformat `cc_muller$class_pro_retro` as a factor. It's essential to note that R handles numerical values differently than Python. In R, numerical values are represented with a floating point, so it's advisable to convert them into factors or strings. Lastly, we employ the map function from the purrr package to assign labels to each sentence corpus using the `$add_label` method. +
+ ```{r} library(purrr) data(cc_muller) @@ -229,10 +265,14 @@ test <- text[!sample] sprintf("Corpus object sizes - Train: %d | Test: %d", length(train), length(test)) ``` +
+ If you don't provide a dev set, flaiR will not force you to carve out a portion of your test set to serve as a dev set. However, in some cases when only the train and test sets are provided without a dev set, flaiR might automatically take a fraction of the train set (e.g., 10%) to use as a dev set ([#2259](https://github.com/flairNLP/flair/issues/2259#issuecomment-830040253)). This is to offer a mechanism for model selection and to prevent the model from overfitting on the train set. In the "Corpus" function, there is a random selection of the "dev" dataset. To ensure reproducibility, we need to set the seed in the flaiR framework. We can accomplish this by calling the top-level module "flair" from {flaiR} and using `$set_seed(1964L)` to set the seed. +
+ ```{r} flair <- import_flair() flair$set_seed(1964L) @@ -248,20 +288,28 @@ sprintf("Corpus object sizes - Train: %d | Test: %d | Dev: %d", length(corp$dev)) ``` +
+ In the later sections, there will be more similar processing using the `Corpus`. Following that, we will focus on advanced NLP applications.
------------------------------------------------------------------------- +  + +----- + # Sequence Taggings -## **Tag Entities in Text** +## Tag Entities in Text
Let's run named entity recognition over the following example sentence: "I love Berlin and New York". To do this, all you need to do is make a Sentence object for this text, load a pre-trained model and use it to predict tags for the object. +
+ + ```{r} # attach flaiR in R library(flaiR) @@ -286,21 +334,27 @@ To print all annotations: print(sentence) ``` +
+ Use a for loop to print out each POS tag. It's important to note that Python is indexed from 0. Therefore, in an R environment, we must use `seq_along(sentence$get_labels()) - 1`. +
+ + ```{r} for (i in seq_along(sentence$get_labels())) { print(sentence$get_labels()[[i]]) } ``` -
-## **Tag Part-of-Speech in Text** +## Tag Part-of-Speech
-We use flaiR/POS-english for POS tagging in the standard models on Hugging Face. +We use `flaiR/POS-english` for POS tagging in the standard models on Hugging Face. + +
```{r} # attach flaiR in R @@ -314,12 +368,49 @@ sentence <- Sentence('I love Berlin and New York.') Classifier <- flair_nn()$Classifier tagger <- Classifier$load('pos') +``` + +_Penn Treebank POS Tags Reference_ + + +| Tag | Description | Example | +|-------|-------------|---------| +| DT | Determiner | the, a, these | +| NN | Noun, singular | cat, tree | +| NNS | Noun, plural | cats, trees | +| NNP | Proper noun, singular | John, London | +| NNPS | Proper noun, plural | Americans | +| VB | Verb, base form | take | +| VBD | Verb, past tense | took | +| VBG | Verb, gerund/present participle | taking | +| VBN | Verb, past participle | taken | +| VBP | Verb, non-3rd person singular present | take | +| VBZ | Verb, 3rd person singular present | takes | +| JJ | Adjective | big | +| RB | Adverb | quickly | +| O | Other | - | +| , | Comma | , | +| . | Period | . | +| : | Colon | : | +| -LRB- | Left bracket | ( | +| -RRB- | Right bracket | ) | +| `` | Opening quotation | " | +| '' | Closing quotation | " | +| HYPH | Hyphen | - | +| CD | Cardinal number | 1, 2, 3 | +| IN | Preposition | in, on, at | +| PRP | Personal pronoun | I, you, he | +| PRP$ | Possessive pronoun | my, your | +| UH | Interjection | oh, wow | +| FW | Foreign word | café | +| SYM | Symbol | +, % | + + +```{r} # run NER over sentence tagger$predict(sentence) ``` -To print all annotations: - ```{r} # print the sentence with all annotations print(sentence) @@ -333,15 +424,15 @@ for (i in seq_along(sentence$get_labels())) { } ``` - - -## **Detect Sentiment** +## Detect Sentiment
Let's run sentiment analysis over the same sentence to determine whether it is POSITIVE or NEGATIVE. -You can do this with essentially the same code as above. Instead of loading the 'ner' model, you now load the 'sentiment' model: +You can do this with essentially the same code as above. Instead of loading the 'ner' model, you now load the `'sentiment'` model: + +
```{r} # attach flaiR in R @@ -364,199 +455,138 @@ tagger$predict(sentence) print(sentence) ``` - ------------------------------------------------------------------------ +## Dealing with Dataframe - -### **Tagging Parts-of-Speech with Flair Models** +### Parts-of-Speech Tagging Across Full DataFrame
-You can load the pre-trained model `"pos-fast"`. For more pre-trained models, see .
-
-```{r}
-texts <- c("UCD is one of the best universities in Ireland.",
-           "UCD has a good campus but is very far from my apartment in Dublin.",
-           "Essex is famous for social science research.",
-           "Essex is not in the Russell Group, but it is famous for political science research and in 1994 Group.",
-           "TCD is the oldest university in Ireland.",
-           "TCD is similar to Oxford.")
+You can apply Part-of-Speech (POS) tagging across an entire DataFrame using Flair's pre-trained models. Let's walk through an example using the pos-fast model.
+First, let's load our required packages and sample data:

-doc_ids <- c("doc1", "doc2", "doc3", "doc4", "doc5", "doc6")
-```

+
```{r} library(flaiR) +data(uk_immigration) +uk_immigration <- uk_immigration[1:2,] ``` -```{r} -tagger_pos <- load_tagger_pos("pos-fast") -``` +
-```{r} -results <- get_pos(texts, doc_ids, tagger_pos) -head(results, n = 10) -``` +For POS tagging, we'll use Flair's pre-trained model. The pos-fast model offers a good balance between speed and accuracy. For more pre-trained models, check out Flair's documentation at Flair POS Tagging Documentation. There are two ways to load the POS tagger:
-### **Tagging Entities with Flair Models** - -
- -Load the pre-trained model `ner`. For more pre-trained models, see . +- Load with tag dictionary display (default): -```{r} -library(flaiR) -``` -```{r} -tagger_ner <- load_tagger_ner("ner") -``` ```{r} -results <- get_entities(texts, doc_ids, tagger_ner) -head(results, n = 10) +tagger_pos <- load_tagger_pos("pos-fast") ``` -
- - -### **Tagging Sentiment** +This will show you all available POS tags grouped by categories (nouns, verbs, adjectives, etc.). -
- -Load the pre-trained model "`sentiment`". The pre-trained models of "`sentiment`", "`sentiment-fast`", and "`de-offensive-language`" are currently available. For more pre-trained models, see . +- Load without tag display for a cleaner output: ```{r} -library(flaiR) +pos_tagger <- load_tagger_pos("pos-fast", show_tags = FALSE) ``` -```{r} -tagger_sent <- load_tagger_sentiments("sentiment") -``` +Now we can process our texts: ```{r} -results <- get_sentiments(texts, doc_ids, tagger_sent) +results <- get_pos(texts = uk_immigration$text, + doc_ids = uk_immigration$speaker, + show.text_id = TRUE, + tagger = pos_tagger) + head(results, n = 10) ``` -
- ------------------------------------------------------------------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### Tagging Entities Across Full DataFrame - - - - - - - +
- +This section focuses on performing Named Entity Recognition (NER) on data stored in a dataframe format. My goal is to identify and tag named entities within text that is organized in a structured dataframe. - +I load the flaiR package and use the built-in uk_immigration dataset. For demonstration purposes, I'm only taking the first two rows. This dataset contains discussions about immigration in the UK. - +Load the pre-trained model `ner`. For more pre-trained models, see . - +
- - +```{r} +library(flaiR) +data(uk_immigration) +uk_immigration <- head(uk_immigration, n = 2) +``` - +
- +Next, I load the latest model hosted and maintained on Hugging Face by the Flair NLP team. For more Flair NER models, you can visit the official Flair NLP page on Hugging Face (https://huggingface.co/flair). - +
- +```{r} +# Load model without displaying tags +# tagger <- load_tagger_ner("flair/ner-english-large", show_tags = FALSE) - +library(flaiR) +tagger_ner <- load_tagger_ner("flair/ner-english-ontonotes") +``` - +
- +I load a pre-trained NER model. Since I'm using a Mac M1/M2, I set the model to run on the MPS device for faster processing. If I want to use other pre-trained models, I can check the Flair documentation website for available options. - +
- +Now I'm ready to process the text: - +```{r} +results <- get_entities(texts = uk_immigration$text, + doc_ids = uk_immigration$speaker, + tagger = tagger_ner, + batch_size = 2, + verbose = FALSE) +head(results, n = 10) +``` - - +  - +----- - +# Embedding - +
- +Flair is a very popular natural language processing library, providing a variety of embedding methods for text representation. Flair Embeddings is a word embedding framework developed by [Zalando](https://engineering.zalando.com/posts/2018/11/zalando-research-releases-flair.html). It focuses on word-level representation and can capture contextual information of words, allowing the same word to have different embeddings in different contexts. Unlike traditional word embeddings (such as Word2Vec or GloVe), Flair can dynamically generate word embeddings based on context and has achieved excellent results in various NLP tasks. Below are some key points about Flair Embeddings: - +
- +**Context-Aware** - +
- +Flair is a dynamic word embedding technique that can understand the meaning of words based on context. In contrast, static word embeddings, such as Word2Vec or GloVe, provide a fixed embedding for each word without considering its context in a sentence. - +Therefore, context-sensitive embedding techniques, such as Flair, can capture the meaning of words in specific sentences more accurately, thus enhancing the performance of language models in various tasks. +
------------------------------------------------------------------------- -# Flair Embedding +__Example:__
-Flair is a very popular natural language processing library, providing a variety of embedding methods for text representation. Flair Embeddings is a word embedding framework developed by [Zalando](https://engineering.zalando.com/posts/2018/11/zalando-research-releases-flair.html). It focuses on word-level representation and can capture contextual information of words, allowing the same word to have different embeddings in different contexts. Unlike traditional word embeddings (such as Word2Vec or GloVe), Flair can dynamically generate word embeddings based on context and has achieved excellent results in various NLP tasks. Below are some key points about Flair Embeddings: - -**Context-Aware** - -Flair is a dynamic word embedding technique that can understand the meaning of words based on context. In contrast, static word embeddings, such as Word2Vec or GloVe, provide a fixed embedding for each word without considering its context in a sentence. - -Therefore, context-sensitive embedding techniques, such as Flair, can capture the meaning of words in specific sentences more accurately, thus enhancing the performance of language models in various tasks. - -Example: - Consider the following two English sentences: - "I am interested in the bank of the river." @@ -566,12 +596,13 @@ Here, the word "bank" has two different meanings. In the first sentence, it refe For static embeddings, the word "bank" might have an embedding that lies somewhere between these two meanings because it doesn't consider context. But for dynamic embeddings like Flair, "bank" in the first sentence will have an embedding related to rivers, and in the second sentence, it will have an embedding related to finance. -```{r} -FlairEmbeddings <- flair_embeddings()$FlairEmbeddings -Sentence <- flair_data()$Sentence +
+```{r} # Initialize Flair embeddings +FlairEmbeddings <- flair_embeddings()$FlairEmbeddings +Sentence <- flair_data()$Sentence flair_embedding_forward <- FlairEmbeddings('news-forward') # Define the two sentences @@ -588,10 +619,14 @@ bank_embedding_sentence1 = sentence1[5]$embedding # "bank" is the seventh word bank_embedding_sentence2 = sentence2[6]$embedding # "bank" is the sixth word ``` +
+ Same word, similar vector representation, but essentially different. In this way, you can see how the dynamic embeddings for "bank" in the two sentences differ based on context. Although we printed the embeddings here, in reality, they would be high-dimensional vectors, so you might see a lot of numbers. If you want a more intuitive view of the differences, you could compute the cosine similarity or other metrics between the two embeddings. This is just a simple demonstration. In practice, you can also combine multiple embedding techniques, such as `WordEmbeddings` and `FlairEmbeddings`, to get richer word vectors. +
+ ```{r} library(lsa) cosine(as.numeric( bank_embedding_sentence1$numpy()), @@ -600,10 +635,16 @@ cosine(as.numeric( bank_embedding_sentence1$numpy()), **Character-Based** +
+ Flair uses a character-level language model, meaning it can generate embeddings for rare words or even misspelled words. This is an important feature because it allows the model to understand and process words that have never appeared in the training data. Flair uses a bidirectional LSTM (Long Short-Term Memory) network that operates at a character level. This allows it to feed individual characters into the LSTM instead of words. +
+ **Multilingual Support** +
+ Flair provides various pre-trained character-level language models, supporting contextual word embeddings for multiple languages. It allows you to easily combine different word embeddings (e.g., Flair Embeddings, Word2Vec, GloVe, etc.) to create powerful stacked embeddings.
@@ -617,6 +658,9 @@ In Flair, the simplest form of embeddings that still contains semantic informati Let's retrieve a few word embeddings and use FastText embeddings with the following code. To do so, we simply instantiate a WordEmbeddings class by passing in the ID of the embedding of our choice. Then, we simply wrap our text into a Sentence object, and call the `embed(sentence)` method on our WordEmbeddings class. + + + ```{r} WordEmbeddings <- flair_embeddings()$WordEmbeddings Sentence <- flair_data()$Sentence @@ -630,8 +674,12 @@ for (i in seq_along(sentence$tokens)) { ``` +
+ Flair supports a range of classic word embeddings, each offering unique features and application scopes. Below is an overview, detailing the ID required to load each embedding and its corresponding language. +
+ | Embedding Type | ID | Language | |-----------------------------|---------|------------| | GloVe | glove | English | @@ -671,9 +719,7 @@ Flair supports a range of classic word embeddings, each offering unique features | FastText (news & Wikipedia) | tr | Turkish | | FastText (news & Wikipedia) | zh | Chinese | - ------------------------------------------------------------------------- ## Contexual Embeddings @@ -683,6 +729,9 @@ The idea behind contextual string embeddings is that each word embedding should Because we are using a forward model, it only takes into account the context that occurs before a word. Additionally, since our word has no context on the left-hand side of its position in the sentence, the two embeddings are identical, and the code assumes they are identical, indeed output is **True**. + + + ```{r} FlairEmbeddings <- flair_embeddings()$FlairEmbeddings embedding <- FlairEmbeddings('news-forward') @@ -694,8 +743,13 @@ embedding$embed(s2) cat(" s1 sentence:", paste(s1[0], sep = ""), "\n", "s2 sentence:", paste(s2[0], sep = "")) ``` +
+ We test whether the sum of the two 2048 embeddings of `nice` is equal to 2048. If it is true, it indicates that the embedding results are consistent, which should theoretically be the case. +
+ + ```{r} length(s1[0]$embedding$numpy()) == sum(s1[0]$embedding$numpy() == s2[0]$embedding$numpy()) ``` @@ -716,10 +770,14 @@ The two sets of embeddings are not identical because the words are different, so length(s1[0]$embedding$numpy()) == sum(s1[0]$embedding$numpy() == s2[0]$embedding$numpy()) ``` +
+ The measure of similarity between two vectors in an inner product space is known as cosine similarity. The formula for calculating cosine similarity between two vectors, such as vectors A and B, is as follows: $Cosine Similarity = \frac{\sum_{i} (A_i \cdot B_i)}{\sqrt{\sum_{i} (A_i^2)} \cdot \sqrt{\sum_{i} (B_i^2)}}$ +
+ ```{r} library(lsa) vector1 <- as.numeric(s1[0]$embedding$numpy()) @@ -733,8 +791,6 @@ cosine_similarity <- cosine(vector1, vector2) print(cosine_similarity) ``` - - ------------------------------------------------------------------------ @@ -742,7 +798,9 @@ print(cosine_similarity)
-First, we utilize the `flair.embeddings.TransformerWordEmbeddings` function to download BERT, and more transformer models can also be found on [Flair NLP's Hugging Face](https://huggingface.co/flair). +First, we utilize the `TransformerWordEmbeddings` function to download BERT, and more transformer models can also be found on [Flair NLP's Hugging Face](https://huggingface.co/flair). + +
```{r}
library(flaiR)
@@ -767,79 +825,282 @@ for (i in seq_along(sentence$tokens)) {
 }
 ```

-
------------------------------------------------------------------------

-# Training a Binary Classifier
+## Visualized Embeddings

-
-In this section, we'll train a sentiment analysis model that can categorize text as either positive or negative. This case study is adapted from pages 116 to 130 of Tadej Magajna's book, '[Natural Language Processing with Flair](https://www.packtpub.com/product/natural-language-processing-with-flair/9781801072311)'. The process for training text classifiers in Flair mirrors the process followed for sequence labeling models. Specifically, the steps to train text classifiers are: +### Word Embeddings (GloVe) -- Load a tagged corpus and compute the label dictionary map. -- Prepare the document embeddings. -- Initialize the `TextClassifier` class. -- Train the model. +- GloVe embeddings are Pytorch vectors of dimensionality 100. -
+- For English, Flair provides a few more options. Here, you can use `en-glove` and `en-extvec` with the __WordEmbeddings__ class. +```{r} +# Initialize Text Processing Tools --------------------------- +# Import Sentence class for text operations +Sentence <- flair_data()$Sentence -## Loading a Tagged Corpus +# Configure GloVe Embeddings -------------------------------- +# Load WordEmbeddings class and initialize GloVe model +WordEmbeddings <- flair_embeddings()$WordEmbeddings +embedding <- WordEmbeddings("glove") -
+# Text Processing and Embedding ----------------------------- +# Create sentence with semantic relationship pairs +sentence <- Sentence("King Queen man woman Paris London apple orange Taiwan Dublin Bamberg") -Training text classification models requires a set of text documents (typically, sentences or paragraphs) where each document is associated with one or more classification labels. To train our sentiment analysis text classification model, we will be using the famous Internet Movie Database (IMDb) dataset, which contains 50,000 movie reviews from IMDB, where each review is labeled as either positive or negative. References to this dataset are already baked into Flair, so loading the dataset couldn't be easier: +# Apply GloVe embeddings to the sentence +embedding$embed(sentence) -```{r} -library(flaiR) -# load IMDB from flair_datasets module -Corpus <- flair_data()$Corpus -IMDB <- flair_datasets()$IMDB -``` +# Extract embeddings into matrix format +sen_df <- process_embeddings(sentence, + verbose = TRUE) -```{r} -# downsize to 0.05 -corpus = IMDB() -corpus$downsample(0.05) -``` +# Dimensionality Reduction --------------------------------- +# Set random seed for reproducibility +set.seed(123) -Print the sizes in the corpus object as follows - test: %d \| train: %d \| dev: %d" +# Apply PCA to reduce dimensions to 3 components +pca_result <- prcomp(sen_df, center = TRUE, scale. 
= TRUE) -```{r} -test_size <- length(corpus$test) -train_size <- length(corpus$train) -dev_size <- length(corpus$dev) -output <- sprintf("Corpus object sizes - Test: %d | Train: %d | Dev: %d", test_size, train_size, dev_size) -print(output) +# Extract first three principal components +word_embeddings_matrix <- as.data.frame(pca_result$x[,1:3]) +word_embeddings_matrix ``` -```{r} -lbl_type = 'sentiment' -label_dict = corpus$make_label_dictionary(label_type=lbl_type) +#### 2D Plot + +```{r, out.width="95%" } +library(ggplot2) +glove_plot2D <- ggplot(word_embeddings_matrix, aes(x = PC1, y = PC2, color = PC3, + label = rownames(word_embeddings_matrix))) + + geom_point(size = 3) + + geom_text(vjust = 1.5, hjust = 0.5) + + scale_color_gradient(low = "blue", high = "red") + + theme_minimal() + + labs(title = "", x = "PC1", y = "PC2", color = "PC3") + # guides(color = "none") +glove_plot2D ``` -
+#### 3D Plot -## Loading the Embeddings +[plotly](https://plotly.com/r/) in R API: https://plotly.com/r/ -
+```{r, message = FALSE, warning = FALSE, out.width="95%"} +library(plotly) +glove_plot3D <- plot_ly(data = word_embeddings_matrix, + x = ~PC1, y = ~PC2, z = ~PC3, + type = "scatter3d", mode = "markers", + marker = list(size = 5), + text = rownames(word_embeddings_matrix), hoverinfo = 'text') -flaiR covers all the different types of document embeddings that we can use. Here, we simply use `DocumentPoolEmbeddings`. They require no training prior to training the classification model itself: +glove_plot3D +``` + +### Stack Embeddings Method (GloVe + Back/forwad FlairEmbeddings or More) ```{r} -DocumentPoolEmbeddings <- flair_embeddings()$DocumentPoolEmbeddings +# Initialize Embeddings ----------------------------- +# Load embedding types from flaiR WordEmbeddings <- flair_embeddings()$WordEmbeddings -glove = WordEmbeddings('glove') -document_embeddings = DocumentPoolEmbeddings(glove) -``` +FlairEmbeddings <- flair_embeddings()$FlairEmbeddings +StackedEmbeddings <- flair_embeddings()$StackedEmbeddings -
+# Configure Embeddings ---------------------------- +# Initialize GloVe word embeddings +glove_embedding <- WordEmbeddings('glove') +# Initialize Flair contextual embeddings +flair_embedding_forward <- FlairEmbeddings('news-forward') +flair_embedding_backward <- FlairEmbeddings('news-backward') -## Initializing the TextClassifier +# Initialize GloVe for individual use +embedding <- WordEmbeddings("glove") + +# Create stacked embeddings combining GloVe and bidirectional Flair +stacked_embeddings <- StackedEmbeddings(c(glove_embedding, + flair_embedding_forward, + flair_embedding_backward)) + +# Text Processing -------------------------------- +# Load Sentence class from flaiR +Sentence <- flair_data()$Sentence + +# Create test sentence with semantic relationships +sentence <- Sentence("King Queen man woman Paris London apple orange Taiwan Dublin Bamberg") + +# Apply embeddings and extract features ---------- +# Embed text using stacked embeddings +stacked_embeddings$embed(sentence) + +# Extract embeddings matrix with processing details +sen_df <- process_embeddings(sentence, + verbose = TRUE) + +# Dimensionality Reduction ----------------------- +set.seed(123) + +# Perform PCA for visualization +pca_result <- prcomp(sen_df, center = TRUE, scale. 
= TRUE) + +# Extract first three principal components +word_embeddings_matrix <- as.data.frame(pca_result$x[,1:3]) +word_embeddings_matrix +``` + +```{r, out.width="95%" } +# 2D Plot +library(ggplot2) + +stacked_plot2D <- ggplot(word_embeddings_matrix, aes(x = PC1, y = PC2, color = PC3, + label = rownames(word_embeddings_matrix))) + + geom_point(size = 2) + + geom_text(vjust = 1.5, hjust = 0.5) + + scale_color_gradient(low = "blue", high = "red") + + theme_minimal() + + labs(title = "", x = "PC1", y = "PC2", color = "PC3") + +stacked_plot2D +``` + + +### Transformer Embeddings (BERT or More) + +```{r} +# Load Required Package ---------------------------- +library(flaiR) + +# Initialize BERT and Text Processing -------------- +# Import Sentence class for text operations +Sentence <- flair_data()$Sentence + +# Initialize BERT model (base uncased version) +TransformerWordEmbeddings <- flair_embeddings()$TransformerWordEmbeddings("bert-base-uncased") + +# Text Processing and Embedding -------------------- +# Create sentence with semantic relationship pairs +sentence <- Sentence("King Queen man woman Paris London apple orange Taiwan Dublin Bamberg") + +# Apply BERT embeddings to the sentence +TransformerWordEmbeddings$embed(sentence) + +# Extract embeddings into matrix format +sen_df <- process_embeddings(sentence, verbose = TRUE) + +# Dimensionality Reduction ------------------------ +# Set random seed for reproducibility +set.seed(123) + +# Apply PCA to reduce dimensions to 3 components +pca_result <- prcomp(sen_df, center = TRUE, scale. 
= TRUE) + +# Extract first three principal components +word_embeddings_matrix <- as.data.frame(pca_result$x[,1:3]) +word_embeddings_matrix +``` + +```{r, out.width="95%" } +# 2D Plot +library(ggplot2) + +bert_plot2D <- ggplot(word_embeddings_matrix, aes(x = PC1, y = PC2, color = PC3, +                                  label = rownames(word_embeddings_matrix))) + +  geom_point(size = 2) + +  geom_text(vjust = 1.5, hjust = 0.5) + +  scale_color_gradient(low = "blue", high = "red") + +  theme_minimal() + +  labs(title = "", x = "PC1", y = "PC2", color = "PC3") + # guides(color = "none") + +bert_plot2D +``` + +### Embedding Models Comparison +```{r, out.width="95%" } +library(ggpubr) + +figure <- ggarrange(glove_plot2D, stacked_plot2D, bert_plot2D, +                    labels = c("Glove", "Stacked Embedding", "BERT"), +                    ncol = 3, nrow = 1, +                    common.legend = TRUE, +                    legend = "bottom", +                    font.label = list(size = 8)) + +figure +``` + + +&nbsp; + +----- + + +# Training a Binary Classifier + +In this section, we'll train a sentiment analysis model that can categorize text as either positive or negative. This case study is adapted from pages 116 to 130 of Tadej Magajna's book, '[Natural Language Processing with Flair](https://www.packtpub.com/product/natural-language-processing-with-flair/9781801072311)'. The process for training text classifiers in Flair mirrors the process followed for sequence labeling models. Specifically, the steps to train text classifiers are: + +- Load a tagged corpus and compute the label dictionary map. +- Prepare the document embeddings. +- Initialize the `TextClassifier` class. +- Train the model. + +## Loading a Tagged Corpus + +
+ +Training text classification models requires a set of text documents (typically, sentences or paragraphs) where each document is associated with one or more classification labels. To train our sentiment analysis text classification model, we will be using the famous Internet Movie Database (IMDb) dataset, which contains 50,000 movie reviews from IMDB, where each review is labeled as either positive or negative. References to this dataset are already baked into Flair, so loading the dataset couldn't be easier: + +
+ +```{r} +library(flaiR) +# load IMDB from flair_datasets module +Corpus <- flair_data()$Corpus +IMDB <- flair_datasets()$IMDB +``` + +```{r} +# downsize to 0.05 +corpus = IMDB() +corpus$downsample(0.05) +``` + +Print the sizes in the corpus object as follows - test: %d \| train: %d \| dev: %d + +```{r} +test_size <- length(corpus$test) +train_size <- length(corpus$train) +dev_size <- length(corpus$dev) +output <- sprintf("Corpus object sizes - Test: %d | Train: %d | Dev: %d", test_size, train_size, dev_size) +print(output) +``` + +```{r} +lbl_type = 'sentiment' +label_dict = corpus$make_label_dictionary(label_type=lbl_type) +``` + + +## Loading the Embeddings
+ +flaiR covers all the different types of document embeddings that we can use. Here, we simply use `DocumentPoolEmbeddings`. They require no training prior to training the classification model itself: + +
+ +```{r} +DocumentPoolEmbeddings <- flair_embeddings()$DocumentPoolEmbeddings +WordEmbeddings <- flair_embeddings()$WordEmbeddings +glove = WordEmbeddings('glove') +document_embeddings = DocumentPoolEmbeddings(glove) +``` + +## Initializing the TextClassifier + ```{r} # initiate TextClassifier TextClassifier <- flair_models()$TextClassifier @@ -848,8 +1109,10 @@ classifier <- TextClassifier(document_embeddings, label_type = lbl_type) ``` + `$to` allows you to set the device to use CPU, GPU, or specific MPS devices on Mac (such as mps:0, mps:1, mps:2). + ```{r eval=FALSE, include=TRUE} classifier$to(flair_device("mps")) ``` @@ -872,20 +1135,24 @@ TextClassifier( (loss_function): CrossEntropyLoss() ) ``` - ## Training the Model -
Training the text classifier model involves two simple steps: -- Defining the model trainer class by passing in the classifier model and the corpus -- Setting off the training process passing in the required training hyper-parameters. +- Defining the model trainer class by passing in the classifier model and the corpus +- Setting off the training process passing in the required training hyper-parameters. + + +
**It is worth noting that the 'L' in numbers like 32L and 5L is used in R to denote that the number is an integer. Without the 'L' suffix, numbers in R are treated as numeric, which are by default double-precision floating-point numbers. In contrast, Python determines the type based on the value of the number itself. Whole numbers (e.g., 5 or 32) are of type int, while numbers with decimal points (e.g., 5.0) are of type float. Floating-point numbers in both languages are representations of real numbers but can have some approximation due to the way they are stored in memory.** +
+ + ```{r} # initiate ModelTrainer ModelTrainer <- flair_trainers()$ModelTrainer @@ -903,7 +1170,6 @@ trainer$train('classifier', max_epochs=10L) ``` -
## Loading and Using the Classifiers @@ -912,6 +1178,8 @@ trainer$train('classifier', After training the text classification model, the resulting classifier will already be stored in memory as part of the classifier variable. It is possible, however, that your Python session exited after training. If so, you'll need to load the model into memory with the following: + + ```{r} TextClassifier <- flair_models()$TextClassifier classifier <- TextClassifier$load('classifier/best-model.pt') @@ -935,22 +1203,22 @@ classifier$predict(sentence) print(sentence$labels) ``` - ------------------------------------------------------------------------- +  + +----- + # Training RNNs
Here, we train a sentiment analysis model to categorize text. In this case, we also include a pipeline that implements the use of Recurrent Neural Networks (RNN). This makes them particularly effective for tasks involving sequential data. This section also show you how to implement one of most powerful features in flaiR, stacked embeddings. You can stack multiple embeddings with different layers and let the classifier learn from different types of features. In Flair NLP, and with the **flaiR** package, it's very easy to accomplish this task. +
## Import Necessary Modules -
- - ```{r} library(flaiR) WordEmbeddings <- flair_embeddings()$WordEmbeddings @@ -960,7 +1228,6 @@ TextClassifier <- flair_models()$TextClassifier ModelTrainer <- flair_trainers()$ModelTrainer ``` -
## Get the IMDB Corpus @@ -968,6 +1235,9 @@ ModelTrainer <- flair_trainers()$ModelTrainer The IMDB movie review dataset is used here, which is a commonly utilized dataset for sentiment analysis. `$downsample(0.1)` method means only 10% of the dataset is used, allowing for a faster demonstration. + + + ```{r} # load the IMDB file and downsize it to 0.1 IMDB <- flair_datasets()$IMDB @@ -976,7 +1246,6 @@ corpus <- IMDB()$downsample(0.1) lbl_type <- 'sentiment' label_dict <- corpus$make_label_dictionary(label_type=lbl_type) ``` - ## Stacked Embeddings @@ -984,6 +1253,9 @@ label_dict <- corpus$make_label_dictionary(label_type=lbl_type) This is one of Flair's most powerful features: it allows for the integration of embeddings to enable the model to learn from more sparse features. Three types of embeddings are utilized here: GloVe embeddings, and two types of Flair embeddings (forward and backward). Word embeddings are used to convert words into vectors. + + + ```{r} # make a list of word embeddings word_embeddings <- list(WordEmbeddings('glove'), @@ -1005,7 +1277,6 @@ classifier <- TextClassifier(document_embeddings, # initialize the text classifier trainer with our corpus trainer <- ModelTrainer(classifier, corpus) ``` - ## Start the Training @@ -1017,6 +1288,9 @@ It is worth noting that the learning rate is a parameter that determines the ste `patience` (aka early stop) is a hyper-parameter used in conjunction with early stopping to avoid overfitting. It determines the number of epochs the training process will tolerate without improvements before stopping the training. Setting max_epochs to 5 means the algorithm will make five passes through the dataset. + + + ```{r eval=FALSE, include=TRUE} # note: the 'L' in 32L is used in R to denote that the number is an integer. trainer$train('models/sentiment', @@ -1025,38 +1299,32 @@ trainer$train('models/sentiment', patience=5L, max_epochs=5L) ``` - - ## To Apply the Trained Model for Prediction -
- ```{r eval=FALSE, include=TRUE} sentence <- "This movie was really exciting!" classifier$predict(sentence) print(sentence.labels) ``` -
------------------------------------------------------------------------ # Finetune Transformers
+ We use data from *The Temporal Focus of Campaign Communication (2020 JOP)* as an example. Let's assume we receive the data for training from different times. First, suppose you have a dataset of 1000 entries called `cc_muller_old`. On another day, with the help of nice friends, you receive another set of data, adding 2000 entries in a dataset called `cc_muller_new`. Both subsets are from `data(cc_muller)`. We will show how to fine-tune a transformer model with `cc_muller_old`, and then continue with another round of fine-tuning using `cc_muller_new`. +
+ ```{r} library(flaiR) ``` - - ## Fine-tuning a Transformers Model -
- **Step 1** Load Necessary Modules from Flair Load necessary classes from `flair` package. @@ -1078,8 +1346,13 @@ TextClassifier <- flair_models()$TextClassifier ModelTrainer <- flair_trainers()$ModelTrainer ``` +
+ We use purrr to help us split sentences using Sentence from `flair_data()`, then use map2 to add labels, and finally use `Corpus` to segment the data. +
+ + ```{r} library(purrr) @@ -1090,7 +1363,6 @@ old_text <- map(cc_muller_old$text, Sentence) old_labels <- as.character(cc_muller_old$class) old_text <- map2(old_text, old_labels, ~ { - .x$add_label("classification", .y) .x }) @@ -1111,8 +1383,12 @@ old_test <- old_test[test_id] old_dev <- old_test[!test_id] ``` +
+ If you do not provide a development set (dev set) while using Flair, it will automatically split the training data into training and development datasets. The test set is used for training the model and evaluating its final performance, whereas the development set is used for adjusting model parameters and preventing overfitting, or in other words, for early stopping of the model. +
+ ```{r} old_corpus <- Corpus(train = old_train, test = old_test) ``` @@ -1123,14 +1399,23 @@ old_corpus <- Corpus(train = old_train, test = old_test) document_embeddings <- TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=TRUE) ``` -First, the `$make_label_dictionary` function is used to automatically create a label dictionary for the classification task. The label dictionary is a mapping from label to index, which is used to map the labels to a tensor of label indices. Besides classification tasks, flaiR also supports other label types for training custom model, such as `ner`, `pos` and `sentiment`. From the cc_muller dataset: Future (seen 423 times), Present (seen 262 times), Past (seen 131 times) +
+ +First, the `$make_label_dictionary` function is used to automatically create a label dictionary for the classification task. The label dictionary is a mapping from label to index, which is used to map the labels to a tensor of label indices. Besides classification tasks, flaiR also supports other label types for training custom model. From the cc_muller dataset: Future (seen 423 times), Present (seen 262 times), Past (seen 131 times). + +
+ ```{r} old_label_dict <- old_corpus$make_label_dictionary(label_type="classification") ``` +
+ `TextClassifier` is used to create a text classifier. The classifier takes the document embeddings (importing from `'distilbert-base-uncased'` from Hugging Face) and the label dictionary as input. The label type is also specified as classification. +
+ ```{r} old_classifier <- TextClassifier(document_embeddings, label_dictionary = old_label_dict, @@ -1153,7 +1438,6 @@ old_trainer$train("vignettes/inst/muller-campaign-communication", save_final_model=TRUE, max_epochs=1L) ``` -
## Continue Fine-tuning with New Dataset @@ -1162,10 +1446,14 @@ old_trainer$train("vignettes/inst/muller-campaign-communication", Now, we can continue to fine tune the already fine tuned model with an additional 2000 pieces of data. First, let's say we have another 2000 entries called `cc_muller_new`. We can fine-tune the previous model with these 2000 entries. The steps are the same as before. For this case, we don't need to split the dataset again. We can use the entire 2000 entries as the training set and use the `old_test` set to evaluate how well our refined model performs. + + **Step 1** Load the `muller-campaign-communication` Model + Load the model (`old_model`) you have already fine tuned from previous stage and let's fine tune it with the new data, `new_corpus`. + ```{r} old_model <- TextClassifier$load("vignettes/inst/muller-campaign-communication/best-model.pt") ``` @@ -1203,5 +1491,596 @@ new_trainer$train("vignettes/inst/new-muller-campaign-communication", max_epochs=1L) ``` + +## Model Performance Metrics: Pre and Post Fine-tuning + + +After fine-tuning for 1 epoch, the model showed improved performance on the same test set. + + +| Evaluation Metric | Pre-finetune | Post-finetune | Improvement | +|-------------------|--------------|---------------|-------------| +| F-score (micro) | 0.7294 | 0.8471 | +0.1177 | +| F-score (macro) | 0.7689 | 0.8583 | +0.0894 | +| Accuracy | 0.7294 | 0.8471 | +0.1177 | + More R tutorial and documentation see [here](https://github.com/davidycliao/flaiR). + +## Using Your Own Fine-tuned Model in flaiR + +This seciton demonstrates how to utilize your custom fine-tuned model in flaiR for text classification tasks. Let's explore this process step by step. 
+ +**Setting Up Your Environment** + +First, we need to load the flaiR package and prepare our model: + + +```{r} +library(flaiR) +classifier <- flair_models()$TextClassifier$load('vignettes/inst/new-muller-campaign-communication/best-model.pt') +``` + +It's important to verify your model's compatibility with `$model_card`. You can check this by examining the version requirements: + +```{r} +print(classifier$model_card) + +``` + +```{r} +# Check required versions +print(classifier$model_card$transformers_version) # Required transformers version +print(classifier$model_card$flair_version) # Required Flair version +``` + +**Making Predictions** + +To make predictions, we first need to prepare our text by creating a Sentence object. This is a key component in Flair's architecture that handles text processing: + +```{r} +# Get the Sentence class from flaiR +Sentence <- flair_data()$Sentence + +# Create a Sentence object with your text +sentence <- Sentence("And to boost the housing we need, we will start to build a new generation of garden cities.") + +# Make prediction +classifier$predict(sentence) + +# Access prediction results +prediction <- sentence$labels[[1]]$value # Get predicted label +confidence <- sentence$labels[[1]]$score # Get confidence score +``` + +  + +----- + + +# Extending conText's Embedding Regression + +
+ +`ConText` is a fast, flexible, and transparent framework for estimating context-specific word and short document embeddings using [the 'a la carte' embeddings regression](https://github.com/prodriguezsosa/EmbeddingRegression), implemented by Rodriguez et al. + +In this case study, I'll demonstrate how to use the conText package alongside other embedding frameworks by working through the example provided in Rodriguez et al.'s [Quick Start Guide](https://github.com/prodriguezsosa/conText/blob/master/vignettes/quickstart.md). While ConText includes its own cross-lingual ALC Embeddings, this tutorial extends its capabilities by integrating it with flaiR. Through this integration, this tutorial shows how to: +
+ + +- Access flaiR's powerful embedding models + +- Connect with any transformer-based embedding models from HuggingFace via FlaiR + + +
+ +I'll be following the example directly from [Rodriguez et al.'s Quick Start Guide]() as this case study. It's important to note that results obtained using alternative embedding frameworks may deviate from the original implementation, and should be interpreted with caution. These comparative results are primarily intended for reference and educational use. + +First of all, when loading the conText package, you'll find three pre-loaded datasets: `cr_sample_corpus`, `cr_glove_subset`, and `cr_transform.` These datasets are used in the package's tutorial to demonstrate preprocessing steps. For this exercise, I'll use `cr_sample_corpus` to explore other embedding frameworks, including: + +
+ + +- `en-crawl` embedding +- Flair NLP contextual embeddings (as described in [Akbik et al., COLING 2018 paper](https://flairnlp.github.io/docs/tutorial-embeddings/flair-embeddings)) + +- Integrated embeddings extracted from transformers like BERT. + + +## Build Document-Embedding-Matrix with Other Embedding Frameworks + +```{r include=FALSE} +library(conText) +library(quanteda) +library(dplyr) +library(text2vec) +library(flaiR) +``` + +**Step 1** Tokenize Text with `quanteda` and `conText` + + +First, let's tokenize `cr_sample_corpus` using the tokens_context function from the conText package. + +```{r} +# tokenize corpus removing unnecessary (i.e. semantically uninformative) elements +toks <- tokens(cr_sample_corpus, remove_punct=T, remove_symbols=T, remove_numbers=T, remove_separators=T) + +# clean out stopwords and words with 2 or fewer characters +toks_nostop <- tokens_select(toks, pattern = stopwords("en"), selection = "remove", min_nchar=3) + +# only use features that appear at least 5 times in the corpus +feats <- dfm(toks_nostop, tolower=T, verbose = FALSE) %>% dfm_trim(min_termfreq = 5) %>% featnames() + +# leave the pads so that non-adjacent words will not become adjacent +toks_nostop_feats <- tokens_select(toks_nostop, feats, padding = TRUE) + +# build a tokenized corpus of contexts surrounding the target term "immigration" +immig_toks <- tokens_context(x = toks_nostop_feats, pattern = "immigr*", window = 6L) + +# build document-feature matrix +immig_dfm <- dfm(immig_toks) +``` + +**Step 2** Import Embedding Tools + +
+ +To facilitate the loading of different embedding types, I'll import the following __classes__ and __functions__ from flaiR: `WordEmbeddings`, `FlairEmbeddings`, `TransformerWordEmbeddings`, `StackedEmbeddings`, and `Sentence.` These components enable us to work with GloVe embeddings, Flair's contextual embeddings, and transformer-based embeddings from the HuggingFace library. + +
+ +```{r include=FALSE} +# Load the flaiR library +library(flaiR) + +# Import embedding classes from flair +WordEmbeddings <- flair_embeddings()$WordEmbeddings + +# Import Sentence class for text processing +Sentence <- flair_data()$Sentence + +# Initialize FastText embeddings trained on Common Crawl +fasttext_embeddings <- WordEmbeddings('en-crawl') +``` +
+ +Initialize a Flair Sentence object by concatenating cr_glove_subset row names. The collapse parameter ensures proper tokenization by adding space delimiters. Then, embed the sentence text using the loaded fasttext embeddings. + +
+ +```{r} +# Combine all text into a single string and create a Flair sentence +sentence <- Sentence(paste(rownames(cr_glove_subset), collapse = " ")) + +# Apply FastText embeddings to the sentence +fasttext_embeddings$embed(sentence) +``` + +
+ +The `process_embeddings` function from flaiR extracts the embedded word vectors from a sentence object and arranges them into a structured matrix. In this matrix, tokens are represented as rows, embedding dimensions as columns, and each row is labeled with its corresponding token text. +
+ +```{r} +fasttext_subset <- process_embeddings(sentence, verbose = TRUE) +``` + +**Step 3** Computing Context-Specific Word Embeddings Using FastText + +
+ +Create a feature co-occurrence matrix (FCM) from tokenized text and transform pre-trained FastText embeddings using co-occurrence information. + +
+ +```{r} +# Create a feature co-occurrence matrix (FCM) from tokenized text +toks_fcm <- fcm(toks_nostop_feats, +                context = "window", +                window = 6, +                count = "frequency", +                tri = FALSE) + +# Transform pre-trained FastText embeddings using co-occurrence information +ft_transform <- compute_transform( +  x = toks_fcm, +  pre_trained = fasttext_subset, +  weighting = 'log' +) +``` + +Calculate Document Embedding Matrix (DEM) using transformed FastText embeddings. + +```{r} +# Calculate Document Embedding Matrix (DEM) using transformed FastText embeddings +immig_dem_ft <- dem(x = immig_dfm, +                    pre_trained = fasttext_subset, +                    transform = TRUE, +                    transform_matrix = ft_transform, +                    verbose = TRUE) +``` + +Show each document inherits its corresponding docvars. + +```{r} +head(immig_dem_ft@docvars) +``` + + +**Step 4** Embedding Regression + +```{r} +set.seed(2021L) +library(conText) +ft_model <- conText(formula = immigration ~ party + gender, +                  data = toks_nostop_feats, +                  pre_trained = fasttext_subset, +                  transform = TRUE, +                  transform_matrix = ft_transform, +                  confidence_level = 0.95, +                  permute = TRUE, +                  jackknife = TRUE, +                  num_permutations = 100, +                  window = 6, case_insensitive = TRUE, +                  verbose = FALSE) +``` + +Extract D-dimensional beta coefficients. 
+ +```{r} +# The intercept in this case is the fastext embedding for female Democrats +# beta coefficients can be combined to get each group's fastext embedding +DF_wv <- ft_model['(Intercept)',] # (D)emocrat - (F)emale +DM_wv <- ft_model['(Intercept)',] + ft_model['gender_M',] # (D)emocrat - (M)ale +RF_wv <- ft_model['(Intercept)',] + ft_model['party_R',] # (R)epublican - (F)emale +RM_wv <- ft_model['(Intercept)',] + ft_model['party_R',] + ft_model['gender_M',] # (R)epublican - (M)ale +``` + +nearest neighbors +```{r} +nns(rbind(DF_wv,DM_wv), + N = 10, + pre_trained = fasttext_subset, + candidates = ft_model@features) +``` + +```{r} +library(ggplot2) +ggplot(ft_model@normed_coefficients, aes(x = coefficient, y = normed.estimate)) + + geom_errorbar(aes(ymin = lower.ci, ymax = upper.ci), width = 0.2) + + geom_point(size = 3) + + geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") + + theme_minimal() + + labs( + title = "Estimated Coefficients with 95% CIs", + x = "Variables", + y = "Normalized Estimate" + ) + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + +``` + +## Exploring Document-Embedding Matrix with conText Functions + +Check dimensions of the resulting matrix. +```{r} +# Calculate average document embeddings for immigration-related texts +immig_wv_ft <- matrix(colMeans(immig_dem_ft), + ncol = ncol(immig_dem_ft)) %>% `rownames<-`("immigration") +dim(immig_wv_ft) +``` + +to get group-specific embeddings, average within party +```{r} +immig_wv_ft_party <- dem_group(immig_dem_ft, + groups = immig_dem_ft@docvars$party) +dim(immig_wv_ft_party) +``` + +Find nearest neighbors by party +```{r} +# find nearest neighbors by party +# setting as_list = FALSE combines each group's results into a single tibble (useful for joint plotting) +immig_nns_ft <- nns(immig_wv_ft_party, + pre_trained = fasttext_subset, + N = 5, + candidates = immig_wv_ft_party@features, + as_list = TRUE) +``` + +check out results for Republican. 
+```{r} +immig_nns_ft[["R"]] +``` + +check out results for Democrat +```{r} +immig_nns_ft[["D"]] +``` + +compute the cosine similarity between each party's embedding and a specific set of features +```{r} +cos_sim(immig_wv_ft_party, + pre_trained = fasttext_subset, + features = c('reform', 'enforcement'), as_list = FALSE) +``` + +compute the cosine similarity between each party's embedding and a specific set of features. + +```{r} +# Republican +nns_ratio(x = immig_wv_ft_party, + N = 15, + numerator = "R", + candidates = immig_wv_ft_party@features, + pre_trained = fasttext_subset, + verbose = FALSE) +``` + +```{r} +# Democrat +nns_ratio(x = immig_wv_ft_party, + N = 15, + numerator = "D", + candidates = immig_wv_ft_party@features, + pre_trained = fasttext_subset, + verbose = FALSE) +``` + +compute the cosine similarity between each party's embedding and a set of tokenized contexts +```{r} +immig_ncs <- ncs(x = immig_wv_ft_party, + contexts_dem = immig_dem_ft, + contexts = immig_toks, + N = 5, + as_list = TRUE) + +# nearest contexts to Republican embedding of target term +# note, these may included contexts originating from Democrat speakers +immig_ncs[["R"]] +``` + +```{r} +immig_ncs[["D"]] +``` + +## Comparative Analysis of A La Carte, Flair Stacked, and BERT Embeddings + +### Build A La Carte Document-Embedding-Matrix + +```{r} +# build a document-embedding-matrix +immig_dem <- dem(x = immig_dfm, pre_trained = cr_glove_subset, transform = TRUE, transform_matrix = cr_transform, verbose = TRUE) + +set.seed(2021L) +alc_model <- conText(formula = immigration ~ party + gender, + data = toks_nostop_feats, + pre_trained = cr_glove_subset, + transform = TRUE, + transform_matrix = cr_transform, + jackknife = TRUE, + confidence_level = 0.95, + permute = TRUE, + num_permutations = 100, + window = 6, + case_insensitive = TRUE, + verbose = FALSE) +``` + +### Document Embedding Matrix Construction Using Flair Contextual Stacked Embeddings + +
+ +To facilitate the loading of different embedding types, I'll import the following __classes__ and __functions__ from flaiR: `WordEmbeddings`, `FlairEmbeddings`, `TransformerWordEmbeddings`, `StackedEmbeddings`, and `Sentence`. These components enable us to work with GloVe embeddings, Flair's contextual embeddings, and transformer-based embeddings from the HuggingFace library. +
+ +```{r include=FALSE} +# Load the flaiR library +library(flaiR) + +# Import embedding classes from flair +WordEmbeddings <- flair_embeddings()$WordEmbeddings +FlairEmbeddings <- flair_embeddings()$FlairEmbeddings +TransformerWordEmbeddings <- flair_embeddings()$TransformerWordEmbeddings +StackedEmbeddings <- flair_embeddings()$StackedEmbeddings + +# Import Sentence class for text processing +Sentence <- flair_data()$Sentence + +# Initialize FastText embeddings trained on Common Crawl +fasttext_embeddings <- WordEmbeddings('en-crawl') + +# Initialize Flair's forward language model embeddings trained on news data +flair_forward <- FlairEmbeddings('news-forward') + +# Initialize Flair's backward language model embeddings trained on news data +flair_backward <- FlairEmbeddings('news-backward') + +``` + +Combine three different types of embeddings into a stacked embedding model. + +This creates a stacked embedding model that combines: + +- `FastText embeddings`: Captures general word semantics +- `Forward Flair`: Captures contextual information reading text left-to-right +- `Backward Flair`: Captures contextual information reading text right-to-left + + +```{r} +stacked_embeddings <- StackedEmbeddings(list( + fasttext_embeddings, + flair_forward, + flair_backward +)) +``` + +```{r} +# Step 1: Create a Flair Sentence object from the text +sentence <- Sentence(paste(rownames(cr_glove_subset), collapse = " ")) + +# Step 2: Generate embeddings using our stacked model +stacked_embeddings$embed(sentence) + +# Step 3: Extract and store embeddings for each token +stacked_subset <- process_embeddings(sentence, verbose = TRUE) + +# Step 4: Compute transformation matrix + +st_transform <- compute_transform( + x = toks_fcm, + pre_trained = stacked_subset, + weighting = 'log' +) + +# Step 5: Generate document embeddings matrix +immig_dem_st <- dem( + x = immig_dfm, + pre_trained = stacked_subset, + transform = TRUE, + transform_matrix = st_transform, + verbose = TRUE +) + +# Step 
6: Fit conText model for analysis +set.seed(2021L) +st_model <- conText(formula = immigration ~ party + gender, + data = toks_nostop_feats, + pre_trained = stacked_subset, + transform = TRUE, + transform_matrix = st_transform, + jackknife = TRUE, + confidence_level = 0.95, + permute = TRUE, + num_permutations = 100, + window = 6, + case_insensitive = TRUE, + verbose = FALSE) +``` + +### Document Embedding Matrix Construction with BERT + +
+BERT embeddings provide powerful contextual representations through their bidirectional transformer architecture. These embeddings are good at understanding context from both directions within text, generating deep contextual representations through multiple transformer layers, and leveraging pre-training on large text corpora to achieve strong performance across NLP tasks. The classic BERT base model generates 768-dimensional embeddings for each token, providing rich semantic representations. + +
+ +By utilizing the Flair framework, we also can seamlessly integrate: + +- Multiple BERT variants like RoBERTa and DistilBERT +- Cross-lingual models such as XLM-RoBERTa +- Domain-adapted BERT models +- Any transformer model available on HuggingFace + + + +```{r} +# Initialize BERT base uncased model embeddings from HuggingFace +bert_embeddings <- TransformerWordEmbeddings('bert-base-uncased') +``` + +```{r} +# Step 1: Create a Flair Sentence object from the text +sentence <- Sentence(paste(rownames(cr_glove_subset), collapse = " ")) + +# Step 2: Generate embeddings using BERT model from HugginFace +bert_embeddings$embed(sentence) + +# Step 3: Extract and store embeddings for each token +bert_subset <- process_embeddings(sentence, verbose = TRUE) + +# Step 4: Compute transformation matrix +bt_transform <- compute_transform(x = toks_fcm, + pre_trained = bert_subset, + weighting = 'log') + +# Step 5: Generate document embeddings matrix +immig_dem_bt <- dem(x = immig_dfm, + pre_trained = bert_subset, + transform = TRUE, + transform_matrix = bt_transform, + verbose = TRUE) + +# Step 6: Fit conText model for analysis +set.seed(2021L) +bt_model <- conText(formula = immigration ~ party + gender, + data = toks_nostop_feats, + pre_trained = bert_subset, + transform = TRUE, + transform_matrix = bt_transform, + jackknife = TRUE, + confidence_level = 0.95, + permute = TRUE, + num_permutations = 100, + window = 6, + case_insensitive = TRUE, + verbose = FALSE) +``` + +### Comparision of Different Embedding Approaches + +
+ +While this tutorial doesn't determine a definitive best approach, it's important to understand the key distinctions between word embedding methods. BERT, FastText, Flair Stacked Embeddings, and GloVe can be categorized into two groups: dynamic and static embeddings. + +Dynamic embeddings, particularly BERT and Flair, adapt their word representations based on context using high-dimensional vector spaces (BERT uses 768 dimensions in its base model). BERT employs self-attention mechanisms and subword tokenization, while Flair uses character-level modeling. Both effectively handle out-of-vocabulary words through these mechanisms. + +However, there is a notable difference between their case study and here. While they provide selected words, we directly extract individual word vectors from BERT and Flair (forward/backward) embeddings using the same set of words. This doesn't truly utilize BERT and Flair embeddings' capability of modeling context. A more meaningful approach would be to extract embeddings at the quasi-sentence or paragraph level, or alternatively, to pool the entire document before extracting embeddings. + +These context-based approaches stand in stark contrast to GloVe's methodology, which relies on pre-computed global word-word co-occurrence statistics to generate static word vectors. + +
+ + +```{r echo=FALSE, message = TRUE, warning = TRUE, out.width="95%"} + +st <- as.data.frame(st_model@normed_coefficients) +st["model"] <- "Flair Stacked Embeddings" +ft <- as.data.frame(ft_model@normed_coefficients) +ft["model"] <- "FastTest" +bt <- as.data.frame(bt_model@normed_coefficients) +bt["model"] <- "BERT" +ac <- as.data.frame(alc_model@normed_coefficients) +ac["model"] <- "Glove" + +merged_df <- rbind(st,ft,bt,ac) + +library(ggplot2) +# Create a faceted plot comparing different embedding results in a 2x2 grid +ggplot(merged_df, aes(x = coefficient, y = normed.estimate)) + + geom_errorbar(aes(ymin = lower.ci, ymax = upper.ci), width = 0.2) + + geom_point(size = 3) + + # Add reference line at y=0 + geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") + + theme_minimal() + + labs( + title = "Estimated Coefficients with 95% CIs", + x = "", + y = "Normalized Estimate" + ) + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + facet_wrap(~model, nrow = 2, ncol = 2, scales = "free_y") +``` + +  + +----- + +# Cite + +``` +@Manual{, + title = {Flair NLP and flaiR for Social Science}, + author = {Yen-Chieh Liao, Sohini Timbadia and Stefan Müller}, + year = {2024}, + url = {https://davidycliao.github.io/flaiR/articles/tutorial.html} +} +``` + + +