diff --git a/.circleci/config.yml b/.circleci/config.yml
deleted file mode 100644
index 1071aa5aeb..0000000000
--- a/.circleci/config.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-version: 2
-jobs:
-  build:
-    docker:
-      - image: cimg/python:3.8.11
-
-    working_directory: ~/gensim
-
-    steps:
-      - checkout
-
-      - restore_cache:
-          key: pip-cache
-
-      - run:
-          name: Apt install (for latex render)
-          command: |
-            sudo apt-get -yq update
-            sudo apt-get -yq remove texlive-binaries --purge
-            sudo apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended latexmk
-            sudo apt-get -yq install build-essential python3.8-dev
-
-      - run:
-          name: Basic installation (tox)
-          command: |
-            python3.8 -m virtualenv venv
-            source venv/bin/activate
-            pip install tox --progress-bar off
-
-      - run:
-          name: Build documentation
-          environment:
-            TOX_PARALLEL_NO_SPINNER: 1
-            TOX_PIP_OPTS: --progress-bar=off
-          command: |
-            source venv/bin/activate
-            tox -e compile,docs -vv
-
-      - store_artifacts:
-          path: docs/src/_build
-          destination: documentation
-
-      - save_cache:
-          key: pip-cache
-          paths:
-            - "~/.cache/pip"
-            - "~/.ccache"
-            - "~/.pip-cache"
diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml
index 85e8637b86..cab6a16641 100644
--- a/.github/workflows/build-wheels.yml
+++ b/.github/workflows/build-wheels.yml
@@ -9,15 +9,46 @@ on:
     - cron: '0 0 * * sun,wed'
 
 jobs:
+  #
+  # The linters job duplicates tests.yml; we haven't found a way to avoid this yet.
+  #
+  linters:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Set up Python ${{ matrix.python }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Update pip
+        run: python -m pip install -U pip
+
+      - name: Install dependencies
+        run: python -m pip install flake8 flake8-rst
+
+      - name: Run flake8 linter (source)
+        run: flake8 --ignore E12,W503 --max-line-length 120 --show-source gensim
+
+      # - name: Run flake8 linter (documentation)
+      #   run: flake8 --ignore E202,E402,E302,E305,F821 --max-line-length 120 --filename '*.py,*.rst' docs
+
+      - name: Check Sphinx Gallery cache
+        run: python docs/src/check_gallery.py
+
   build:
+    timeout-minutes: 30
     runs-on: ${{ matrix.os }}
     defaults:
       run:
         shell: bash
+
+    needs: [linters]
+
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: ['3.7', '3.8', '3.9', '3.10']
         os: [ubuntu-latest, macos-latest, windows-latest]
         platform: [x64]
         include:
@@ -43,11 +74,6 @@ jobs:
          #   https://github.com/scipy/oldest-supported-numpy/blob/master/setup.cfg
          # with the exception that we enforce the minimum version to be 1.17.0.
          #
-          - os: ubuntu-latest
-            manylinux-version: 2010
-            python-version: 3.6
-            build-depends: numpy==1.17.0
-
           - os: ubuntu-latest
             manylinux-version: 2010
             python-version: 3.7
@@ -63,11 +89,10 @@ jobs:
             python-version: 3.9
             build-depends: numpy==1.19.3
 
-          - os: macos-latest
-            travis-os-name: osx
-            manylinux-version: 1
-            python-version: 3.6
-            build-depends: numpy==1.17.0
+          - os: ubuntu-latest
+            manylinux-version: 2014
+            python-version: "3.10"
+            build-depends: numpy==1.22.2 scipy==1.8.0
 
           - os: macos-latest
             travis-os-name: osx
@@ -87,10 +112,11 @@ jobs:
             python-version: 3.9
             build-depends: numpy==1.19.3
 
-          - os: windows-latest
-            manylinux-version: 2010
-            python-version: 3.6
-            build-depends: numpy==1.17.0
+          - os: macos-latest
+            travis-os-name: osx
+            manylinux-version: 1
+            python-version: "3.10"
+            build-depends: numpy==1.22.2 scipy==1.8.0
 
           - os: windows-latest
             manylinux-version: 2010
@@ -107,6 +133,11 @@ jobs:
             python-version: 3.9
             build-depends: numpy==1.19.3
 
+          - os: windows-latest
+            manylinux-version: 2010
+            python-version: "3.10"
+            build-depends: numpy==1.22.2 scipy==1.8.0
+
     env:
       PKG_NAME: gensim
       REPO_DIR: gensim
@@ -114,7 +145,7 @@ jobs:
       PLAT: x86_64
       UNICODE_WIDTH: 32
       MB_PYTHON_VERSION: ${{ matrix.python-version }}  # MB_PYTHON_VERSION is needed by Multibuild
-      TEST_DEPENDS: Morfessor==2.0.2a4 python-levenshtein==0.12.0 visdom==0.1.8.9 pytest mock cython nmslib pyemd testfixtures scikit-learn pyemd
+      TEST_DEPENDS: pytest mock testfixtures
       DOCKER_TEST_IMAGE: multibuild/xenial_x86_64
       TRAVIS_OS_NAME: ${{ matrix.travis-os-name }}
       SKIP_NETWORK_TESTS: 1
@@ -144,7 +175,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install virtualenv
-      - name: Build and Install Wheels (Multibuild)
+      - name: Build Wheel (Multibuild)
         if: matrix.os != 'windows-latest'
         run: |
           echo ::group::Set up Multibuild
@@ -156,17 +187,16 @@ jobs:
           before_install
           echo ::endgroup::
           echo ::group::Build wheel
+          find . -type f -name "*.egg" -exec rm -v {} \;
           build_wheel $REPO_DIR ${{ matrix.PLAT }}
           echo ::endgroup::
-          echo ::group::Install run
-          install_run ${{ matrix.PLAT }}
-          echo ::endgroup::
+
       #
       # We can't use multibuild on Windows, so we have to roll our own build script.
       # Adapted from
       # https://github.com/RaRe-Technologies/gensim-wheels/commit/084b863390edee05bbe15d4ec05d1ab726e52202
       #
-      - name: Build and Install Wheels (Windows)
+      - name: Build Wheel (Windows)
         if: matrix.os == 'windows-latest'
         run: |
           echo ::group::Set up dependencies
@@ -190,6 +220,50 @@ jobs:
           # mv dist wheelhouse
 
+      - name: Prepare for testing
+        run: |
+          #
+          # FIXME: Why are these eggs here?
+          #
+          # These eggs prevent the wheel from building and running on Py3.10.
+          #
+          find . -type f -name "*.egg" -exec rm -v {} \;
+          python -m venv test_environment
+
+      #
+      # Multibuild has a test step but it essentially just installs the wheel
+      # and runs the test, and requires a lot of magic to get it working.
+      # It also does not work under Windows.
+      # So, we create our own simple test step here.
+      #
+      - name: Install and Test Wheel (Linux, MacOS)
+        if: matrix.os != 'windows-latest'
+        run: |
+          . test_environment/bin/activate
+          pip install pytest testfixtures mock
+          pip install wheelhouse/*.whl
+          cd test_environment
+          python -c 'import gensim;print(gensim.__version__)'
+          #
+          # This part relies on the wheel containing tests and required data.
+          # If we remove that from the wheel, we'll need to rewrite this step.
+          #
+          pytest -rfxEXs --durations=20 --disable-warnings --showlocals --pyargs gensim
+
+      #
+      # We need a separate testing step for Windows, because the command for
+      # activating the virtual environment is slightly different.
+      #
+      - name: Install and Test Wheel (Windows)
+        if: matrix.os == 'windows-latest'
+        run: |
+          test_environment/Scripts/activate.bat
+          pip install pytest testfixtures mock
+          pip install wheelhouse/*.whl
+          cd test_environment
+          python -c 'import gensim;print(gensim.__version__)'
+          pytest -rfxEXs --durations=20 --disable-warnings --showlocals --pyargs gensim
+
       - name: Upload wheels to s3://gensim-wheels
         #
         # Only do this if the credentials are set.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 41a608ef90..3cb54fe8be 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -6,8 +6,82 @@ on:
     branches: [ develop ]
 
 jobs:
+  linters:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Set up Python ${{ matrix.python }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Update pip
+        run: python -m pip install -U pip
+
+      - name: Install dependencies
+        run: python -m pip install flake8 flake8-rst
+
+      - name: Run flake8 linter (source)
+        run: flake8 --ignore E12,W503 --max-line-length 120 --show-source gensim
+
+      # - name: Run flake8 linter (documentation)
+      #   run: flake8 --ignore E202,E402,E302,E305,F821 --max-line-length 120 --filename '*.py,*.rst' docs
+
+      - name: Check Sphinx Gallery cache
+        run: python docs/src/check_gallery.py
+
+  docs:
+    name: build documentation
+    timeout-minutes: 10
+    runs-on: ubuntu-20.04
+    defaults:
+      run:
+        shell: bash
+
+    #
+    # Don't run this job unless the linters have succeeded.
+    # It's wasteful to test code that failed to lint, because it'll get
+    # re-tested once the lint errors are fixed.
+    #
+    needs: [linters]
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python }}
+        uses: actions/setup-python@v2
+        with:
+          #
+          # We use Py3.8 here for historical reasons.
+          #
+          python-version: "3.8"
+
+      - name: Update pip
+        run: python -m pip install -U pip
+
+      - name: Install apt packages for LaTeX rendering
+        run: |
+          sudo apt-get -yq update
+          sudo apt-get -yq remove texlive-binaries --purge
+          sudo apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended latexmk
+          sudo apt-get -yq install build-essential python3.8-dev
+
+      - name: Install gensim and its dependencies
+        run: pip install -e .[docs]
+
+      - name: Build documentation
+        run: |
+          python setup.py build_ext --inplace
+          make -C docs/src clean html
+
+      #
+      # FIXME: do we want to store the built documentation somewhere, or is
+      # knowing that the docs built successfully enough?
+      #
+
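One possible answer to the FIXME above is to persist the rendered HTML as a workflow artifact. The step below is only a sketch, not part of this diff: it assumes the stock `actions/upload-artifact` action and reuses the `docs/src/_build` path that the deleted CircleCI config stored as its `documentation` artifact.

```yaml
      # Hypothetical extra step for the docs job above: keep the rendered HTML
      # around for download, mirroring what the old CircleCI job stored.
      - name: Upload built documentation
        uses: actions/upload-artifact@v2
        with:
          name: documentation
          path: docs/src/_build
```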
   tests:
-    name: ${{ matrix.name }}
+    name: test ${{ matrix.os }} python ${{ matrix.python }}
+    timeout-minutes: 30
     runs-on: ${{ matrix.os }}
     defaults:
       run:
@@ -16,12 +90,22 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - {name: Linux, python: 3.6, os: ubuntu-20.04, tox: 'flake8,flake8-docs'}
-          - {name: Linux, python: 3.6, os: ubuntu-20.04, tox: 'py36-linux'}
-          - {name: Linux, python: 3.7, os: ubuntu-20.04, tox: 'py37-linux'}
-          - {name: Linux, python: 3.8, os: ubuntu-20.04, tox: 'py38-linux'}
-    env:
-      TOX_PARALLEL_NO_SPINNER: 1
+          - {python: 3.7, os: ubuntu-20.04}
+          - {python: 3.8, os: ubuntu-20.04}
+          - {python: 3.9, os: ubuntu-20.04}
+          - {python: '3.10', os: ubuntu-20.04}
+
+          - {python: 3.7, os: windows-2019}
+          - {python: 3.8, os: windows-2019}
+          - {python: 3.9, os: windows-2019}
+          - {python: '3.10', os: windows-2019}
+
+    #
+    # Don't run this job unless the linters have succeeded.
+    # It's wasteful to test code that failed to lint, because it'll get
+    # re-tested once the lint errors are fixed.
+    #
+    needs: [linters]
 
     steps:
       - uses: actions/checkout@v2
@@ -38,23 +122,56 @@ jobs:
       #
      # https://www.scala-sbt.org/1.x/docs/Installing-sbt-on-Linux.html
       #
       - name: Update sbt
+        if: matrix.os == 'ubuntu-20.04'
         run: |
           echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list
           echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee /etc/apt/sources.list.d/sbt_old.list
           curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add
           sudo apt-get update -y
           sudo apt-get install -y sbt
-      - name: Install tox, gdb
+
+      - name: Install GDB & enable core dumps
+        if: matrix.os == 'ubuntu-20.04'
         run: |
-          pip install tox
           sudo apt-get update -y
           sudo apt-get install -y gdb
-      - name: Enable core dumps
-        run: ulimit -c unlimited -S  # enable core dumps
-      - name: Run tox tests
-        run: tox -e ${{ matrix.tox }}
+          ulimit -c unlimited -S  # enable core dumps
+
+      - name: Install gensim and its dependencies
+        if: matrix.os != 'windows-2019'
+        run: pip install -e .[test]
+
+      - name: Install gensim and its dependencies (Windows)
+        if: matrix.os == 'windows-2019'
+        run: pip install -e .[test-win]
+
+      - name: Build
+        run: |
+          python --version
+          pip --version
+          python setup.py build_ext --inplace
+
+      #
+      # Some of our tests are hanging, and I strongly suspect it's because of the coverage plugin.
+      #
+      - name: Run tests (without coverage)
+        if: matrix.coverage != true
+        run: pytest -v gensim/test
+
+      - name: Run tests (with coverage)
+        if: matrix.coverage == true
+        run: pytest -v gensim/test --cov=gensim/ --cov-report=xml
+
+      - name: Upload coverage to Codecov
+        if: matrix.coverage == true
+        uses: codecov/codecov-action@v2
+        with:
+          fail_ci_if_error: true
+          files: ./coverage.xml
+          verbose: true
+
       - name: Collect corefile
-        if: ${{ failure() }}
+        if: failure() && matrix.os == 'ubuntu-20.04'
         run: |
           pwd
           COREFILE=$(find . -maxdepth 1 -name "core*" | head -n 1)
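The hunk above ends right after locating a core dump. For reference, a follow-up along these lines could print a native backtrace from the captured core; this is a hedged sketch, not part of the diff (the step name is hypothetical, and it assumes the GDB package installed earlier in this job):

```yaml
      # Hypothetical follow-up step: feed the detected core dump to gdb and
      # print a backtrace for all threads, non-interactively.
      - name: Print backtrace from core dump
        if: failure() && matrix.os == 'ubuntu-20.04'
        run: |
          COREFILE=$(find . -maxdepth 1 -name "core*" | head -n 1)
          if [ -n "$COREFILE" ]; then
            gdb -c "$COREFILE" $(which python) -ex "thread apply all bt" -ex "set pagination 0" -batch
          fi
```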
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 73b2f735a2..9718b90b64 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,14 +3,79 @@ Changes
 
 ## Unreleased
 
+
+## 4.2.0, 2022-04-29
+
+### :+1: New features
+
+* [#3188](https://github.com/RaRe-Technologies/gensim/pull/3188): Add get_sentence_vector() to FastText and get_mean_vector() to KeyedVectors, by [@rock420](https://github.com/rock420)
+* [#3194](https://github.com/RaRe-Technologies/gensim/pull/3194): Added random_seed parameter to make LsiModel reproducible, by [@parashardhapola](https://github.com/parashardhapola)
+* [#3247](https://github.com/RaRe-Technologies/gensim/pull/3247): Sparse2Corpus: update __getitem__ to work on slices, lists and ellipsis, by [@PrimozGodec](https://github.com/PrimozGodec)
+* [#3264](https://github.com/RaRe-Technologies/gensim/pull/3264): Detect when a fasttext executable is available in PATH, by [@pabs3](https://github.com/pabs3)
+* [#3271](https://github.com/RaRe-Technologies/gensim/pull/3271): Added new ValueError in place of assertion error for no model data provided in lsi model, by [@mark-todd](https://github.com/mark-todd)
+* [#3299](https://github.com/RaRe-Technologies/gensim/pull/3299): Enable test_word2vec_stand_alone_script by using sys.executable for python, by [@pabs3](https://github.com/pabs3)
+* [#3317](https://github.com/RaRe-Technologies/gensim/pull/3317): Added `encoding` parameter to TextDirectoryCorpus, by [@Sandman-Ren](https://github.com/Sandman-Ren)
+* [#2656](https://github.com/RaRe-Technologies/gensim/pull/2656): Streamlining most_similar_cosmul and evaluate_word_analogies, by [@n3hrox](https://github.com/n3hrox)
+
+
+### :books: Tutorials and docs
+
+* [#3227](https://github.com/RaRe-Technologies/gensim/pull/3227): Fix FastText doc-comment example for `build_vocab` and `train` to use correct argument names, by [@HLasse](https://github.com/HLasse)
+* [#3235](https://github.com/RaRe-Technologies/gensim/pull/3235): Fix TFIDF docs, by [@piskvorky](https://github.com/piskvorky)
+* [#3257](https://github.com/RaRe-Technologies/gensim/pull/3257): Dictionary doc: ref FAQ entry about filter_extremes corpus migration, by [@zacchiro](https://github.com/zacchiro)
+* [#3279](https://github.com/RaRe-Technologies/gensim/pull/3279): Add the FastSS and Levenshtein modules to docs, by [@piskvorky](https://github.com/piskvorky)
+* [#3284](https://github.com/RaRe-Technologies/gensim/pull/3284): Documentation fixes + added CITATION.cff, by [@piskvorky](https://github.com/piskvorky)
+* [#3289](https://github.com/RaRe-Technologies/gensim/pull/3289): Typos, text and code fix in LDA tutorial, by [@davebulaval](https://github.com/davebulaval)
+* [#3301](https://github.com/RaRe-Technologies/gensim/pull/3301): Remove unused Jupyter screenshots, by [@pabs3](https://github.com/pabs3)
+* [#3307](https://github.com/RaRe-Technologies/gensim/pull/3307): Documentation fixes, by [@piskvorky](https://github.com/piskvorky)
+* [#3339](https://github.com/RaRe-Technologies/gensim/pull/3339): Fix parsing error in FastText docs, by [@MattYoon](https://github.com/MattYoon)
+* [#3251](https://github.com/RaRe-Technologies/gensim/pull/3251): Apply new convention of delimiting instance params in str function, by [@menshikh-iv](https://github.com/menshikh-iv)
+
+### :red_circle: Bug fixes
+
+* [#3117](https://github.com/RaRe-Technologies/gensim/pull/3117): Ensure next_index available when loading old stored KeyedVectors models, by [@gojomo](https://github.com/gojomo)
+* [#3182](https://github.com/RaRe-Technologies/gensim/pull/3182): Fix error message when Doc2Vec does not receive corpus_file or corpus iterable, by [@blainedietrich](https://github.com/blainedietrich)
+* [#3190](https://github.com/RaRe-Technologies/gensim/pull/3190): Fix broken external link for LDA implementation, by [@ahaya3776](https://github.com/ahaya3776)
+* [#3197](https://github.com/RaRe-Technologies/gensim/pull/3197): Fix computation of topic coherence, by [@silviatti](https://github.com/silviatti)
+* [#3250](https://github.com/RaRe-Technologies/gensim/pull/3250): Make negative ns_exponent work correctly, by [@menshikh-iv](https://github.com/menshikh-iv)
+* [#3282](https://github.com/RaRe-Technologies/gensim/pull/3282): Fix `str()` method in WmdSimilarity, by [@DingQK](https://github.com/DingQK)
+* [#3286](https://github.com/RaRe-Technologies/gensim/pull/3286): Fixes 'not enough arguments for format string' error, by [@gilbertfrancois](https://github.com/gilbertfrancois)
+* [#3309](https://github.com/RaRe-Technologies/gensim/pull/3309): Respect encoding when reading binary keyed vectors, by [@alhoo](https://github.com/alhoo)
+* [#3332](https://github.com/RaRe-Technologies/gensim/pull/3332): Missing `f` prefix on f-strings fix, by [@code-review-doctor](https://github.com/code-review-doctor)
+
+### :warning: Removed functionality & deprecations
+
+### 🔮 Testing, CI, housekeeping
+
+* [#3230](https://github.com/RaRe-Technologies/gensim/pull/3230): Adding lifecycle configuration, by [@mpenkov](https://github.com/mpenkov)
+* [#3252](https://github.com/RaRe-Technologies/gensim/pull/3252): Add Codecov to gensim repo, by [@menshikh-iv](https://github.com/menshikh-iv)
+* [#3255](https://github.com/RaRe-Technologies/gensim/pull/3255): Move windows tests from azure to github actions, by [@menshikh-iv](https://github.com/menshikh-iv)
+* [#3263](https://github.com/RaRe-Technologies/gensim/pull/3263): Remove commented out pytest-rerunfailures test dependency, by [@pabs3](https://github.com/pabs3)
+* [#3274](https://github.com/RaRe-Technologies/gensim/pull/3274): Migrate setup.py from distutils to setuptools, by [@geojacobm6](https://github.com/geojacobm6)
+* [#3298](https://github.com/RaRe-Technologies/gensim/pull/3298): test and build wheels for Py3.{7,8,9,10}, by [@mpenkov](https://github.com/mpenkov)
+* [#3300](https://github.com/RaRe-Technologies/gensim/pull/3300): Fix code formatting for FT_CMD definition, by [@pabs3](https://github.com/pabs3)
+* [#3303](https://github.com/RaRe-Technologies/gensim/pull/3303): add GitHub URL for PyPi, by [@andriyor](https://github.com/andriyor)
+* [#3308](https://github.com/RaRe-Technologies/gensim/pull/3308): get rid of tox, build things via github actions directly, by [@mpenkov](https://github.com/mpenkov)
+* [#3318](https://github.com/RaRe-Technologies/gensim/pull/3318): Clean up evaluate_word_pairs code, by [@piskvorky](https://github.com/piskvorky)
+* [#3329](https://github.com/RaRe-Technologies/gensim/pull/3329): Check gallery up to date as part of CI, by [@mpenkov](https://github.com/mpenkov)
+* [#3254](https://github.com/RaRe-Technologies/gensim/pull/3254): Skip flaky test `test_translate_gc` on OSX + py3.9, by [@menshikh-iv](https://github.com/menshikh-iv)
+* [#3258](https://github.com/RaRe-Technologies/gensim/pull/3258): Adding another check to _check_corpus_sanity for compressed files, adding test, by [@dchaplinsky](https://github.com/dchaplinsky)
+* [#3278](https://github.com/RaRe-Technologies/gensim/pull/3278): Tighten test_parallel bound, by [@austereantelope](https://github.com/austereantelope)
+* [#3280](https://github.com/RaRe-Technologies/gensim/pull/3280): Tighten test_topic_word, by [@austereantelope](https://github.com/austereantelope)
+* [#3281](https://github.com/RaRe-Technologies/gensim/pull/3281): Adjust test_parallel bound, by [@austereantelope](https://github.com/austereantelope)
+* [#3297](https://github.com/RaRe-Technologies/gensim/pull/3297): Use gensim.test.utils datapath() to construct paths to the test data, by [@pabs3](https://github.com/pabs3)
+
+
 ## 4.1.2, 2021-09-17
 
 This is a bugfix release that addresses left over compatibility issues with older versions of numpy and MacOS.
 
+
 ## 4.1.1, 2021-09-14
 
 This is a bugfix release that addresses compatibility issues with older versions of numpy.
 
+
 ## 4.1.0, 2021-08-15
 
 Gensim 4.1 brings two major new functionalities:
@@ -91,11 +156,11 @@ Plus a large number of smaller improvements and fixes, as usual.
 * [#3142](https://github.com/RaRe-Technologies/gensim/pull/3142): Use more permanent pdf link and update code link, by [@dymil](https://github.com/dymil)
 * [#3141](https://github.com/RaRe-Technologies/gensim/pull/3141): Update link for online LDA paper, by [@dymil](https://github.com/dymil)
 * [#3133](https://github.com/RaRe-Technologies/gensim/pull/3133): Update link to Hoffman paper (online VB LDA), by [@jonaschn](https://github.com/jonaschn)
-* [#3129](https://github.com/RaRe-Technologies/gensim/pull/3129): [MRG] Add bronze sponsor: TechTarget, by [@piskvorky](https://github.com/piskvorky)
+* [#3129](https://github.com/RaRe-Technologies/gensim/pull/3129): Add bronze sponsor: TechTarget, by [@piskvorky](https://github.com/piskvorky)
 * [#3126](https://github.com/RaRe-Technologies/gensim/pull/3126): Fix typos in make_wiki_online.py and make_wikicorpus.py, by [@nicolasassi](https://github.com/nicolasassi)
 * [#3125](https://github.com/RaRe-Technologies/gensim/pull/3125): Improve & unify docs for dirichlet priors, by [@jonaschn](https://github.com/jonaschn)
 * [#3123](https://github.com/RaRe-Technologies/gensim/pull/3123): Fix hyperlink for doc2vec tutorial, by [@AdityaSoni19031997](https://github.com/AdityaSoni19031997)
-* [#3121](https://github.com/RaRe-Technologies/gensim/pull/3121): [MRG] Add bronze sponsor: eaccidents.com, by [@piskvorky](https://github.com/piskvorky)
+* [#3121](https://github.com/RaRe-Technologies/gensim/pull/3121): Add bronze sponsor: eaccidents.com, by [@piskvorky](https://github.com/piskvorky)
 * [#3120](https://github.com/RaRe-Technologies/gensim/pull/3120): Fix URL for ldamodel.py, by [@jonaschn](https://github.com/jonaschn)
 * [#3118](https://github.com/RaRe-Technologies/gensim/pull/3118): Fix URL in doc string, by [@jonaschn](https://github.com/jonaschn)
 * [#3107](https://github.com/RaRe-Technologies/gensim/pull/3107): Draw attention to sponsoring in README, by [@piskvorky](https://github.com/piskvorky)
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000000..ed3be503f0
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,31 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- family-names: "Řehůřek"
+  given-names: "Radim"
+title: "Gensim: Topic modelling for humans"
+version: 4.1.0
+url: "https://github.com/RaRe-Technologies/gensim"
+preferred-citation:
+  type: conference-paper
+  authors:
+  - family-names: "Řehůřek"
+    given-names: "Radim"
+  - family-names: "Sojka"
+    given-names: "Petr"
+  publisher:
+    name: "University of Malta"
+  date-published: "2010-05-22"
+  year: 2010
+  month: 5
+  start: 45  # First page number
+  end: 50  # Last page number
+  pages: 5
+  title: "Software Framework for Topic Modelling with Large Corpora"
+  languages: ["eng"]
+  url: "http://is.muni.cz/publication/884893/en"
+  conference:
+    name: "Proceedings of LREC 2010 workshop New Challenges for NLP Frameworks"
+    city: Valletta
+    country: MT
+    location: "University of Malta, Valletta, Malta"
diff --git a/README.md b/README.md
index f61cd390e4..f1cb9f3ddd 100644
--- a/README.md
+++ b/README.md
@@ -176,4 +176,3 @@ BibTeX entry:
 [OpenBLAS]: http://xianyi.github.io/OpenBLAS/
 [source tar.gz]: http://pypi.python.org/pypi/gensim
 [documentation]: http://radimrehurek.com/gensim/install.html
-
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000000..3cbff53d79
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,17 @@
+# Security Policy
+
+## Supported Versions
+
+The following versions of Gensim are currently supported with security
+updates.
+
+| Version | Supported          |
+| ------- | ------------------ |
+| 4.x     | :white_check_mark: |
+| < 4.0   | :x:                |
+
+## Reporting a Vulnerability
+
+Open a ticket and add the "security" label to it.
+Describe the vulnerability in general.
+We'll reach out to you for specifics.
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
deleted file mode 100644
index 8e8102fa12..0000000000
--- a/azure-pipelines.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-pool:
-  vmImage: 'vs2017-win2016'
-
-strategy:
-  matrix:
-    py36:
-      python.version: '3.6'
-      TOXENV: "py36-win"
-    py37:
-      python.version: '3.7'
-      TOXENV: "py37-win"
-    py38:
-      python.version: '3.8'
-      TOXENV: "py38-win"
-    py39:
-      python.version: '3.9'
-      TOXENV: "py39-win"
-
-steps:
-- task: UsePythonVersion@0
-  inputs:
-    versionSpec: '$(python.version)'
-  displayName: 'Use Python $(python.version)'
-
-- script: |
-    python -m pip install --upgrade pip
-    python -m pip install tox
-  displayName: 'Install tox'
-
-- script: |
-    tox
-  displayName: 'Testing'
diff --git a/config.sh b/config.sh
index ed9bea2b31..30c2e9d8eb 100755
--- a/config.sh
+++ b/config.sh
@@ -31,8 +31,20 @@ function build_wheel_cmd {
 function run_tests {
     # Runs tests on installed distribution from an empty directory
     set -x
-    python --version
    pip freeze
    pytest -rfxEXs --durations=20 --disable-warnings --showlocals --pyargs gensim
    set +x
 }
+
+#
+# We do this here because we want to upgrade pip before the wheel gets installed.
+# docker_test_wrap.sh sources this file before the wheel install. The sourcing
+# happens from multiple places, and some of the Python versions can be really
+# ancient (e.g. when working outside a virtual environment, using the default
+# Python install).
+#
+# We don't use pip to do the actual upgrade because something appears broken
+# with the default pip on the Python 3.10 multibuild image. This is really
+# dodgy, but I couldn't work out a better way to get this done.
+#
+python continuous_integration/upgrade_pip_py310.py
diff --git a/continuous_integration/BucketLifecycleConfiguration.json b/continuous_integration/BucketLifecycleConfiguration.json
new file mode 100644
index 0000000000..1512b59b5c
--- /dev/null
+++ b/continuous_integration/BucketLifecycleConfiguration.json
@@ -0,0 +1,10 @@
+{
+    "Rules": [
+        {
+            "Expiration": {"Days": 30},
+            "Filter": {"Prefix": ""},
+            "ID": "Delete all files older than 30 days to save storage costs",
+            "Status": "Enabled"
+        }
+    ]
+}
diff --git a/continuous_integration/BucketLifecycleConfiguration.txt b/continuous_integration/BucketLifecycleConfiguration.txt
new file mode 100644
index 0000000000..7392c06393
--- /dev/null
+++ b/continuous_integration/BucketLifecycleConfiguration.txt
@@ -0,0 +1,15 @@
+JSON files can't have comments, so this file is here to explain the rules in BucketLifecycleConfiguration.json.
+
+Our CI puts wheels in a publicly readable, privately writable S3 bucket (s3://gensim-wheels).
+These wheels can be for gensim releases, in which case we fetch them and push them to PyPI when making a release.
+Once the wheels are on PyPI, we don't need to keep our own copy.
+
+These wheels can also be development wheels: we currently build wheels on every push to develop.
+These can be helpful when tracking down a problem, but they can also build up quickly, consume storage space and contribute to AWS costs.
+
+So, we delete all files in the gensim-wheels bucket every 30 days.
+We rarely need to access wheels that are more than a month old, anyway.
+
+If you modify the JSON configuration, then you can update it using the command:
+
+    aws --profile smart_open s3api put-bucket-lifecycle-configuration --bucket gensim-wheels --lifecycle-configuration file://continuous_integration/BucketLifecycleConfiguration.json
diff --git a/continuous_integration/upgrade_pip_py310.py b/continuous_integration/upgrade_pip_py310.py
new file mode 100644
index 0000000000..2a9cb68893
--- /dev/null
+++ b/continuous_integration/upgrade_pip_py310.py
@@ -0,0 +1,10 @@
+# This script needs to be able to run under both Python 2 and 3 without crashing.
+# It only achieves the desired effect under Py3.10 on Linux and MacOS.
+import subprocess
+import sys
+import tempfile
+if sys.platform in ('linux', 'darwin') and sys.version_info[:2] == (3, 10):
+    import urllib.request
+    with tempfile.NamedTemporaryFile(suffix='.py') as fout:
+        urllib.request.urlretrieve("https://bootstrap.pypa.io/get-pip.py", fout.name)
+        subprocess.call([sys.executable, fout.name])
diff --git a/docs/notebooks/doc2vec-wikipedia.ipynb b/docs/notebooks/doc2vec-wikipedia.ipynb
index ff4786d3fd..ca07bc40ec 100644
--- a/docs/notebooks/doc2vec-wikipedia.ipynb
+++ b/docs/notebooks/doc2vec-wikipedia.ipynb
@@ -4,29 +4,30 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Doc2Vec to wikipedia articles"
+    "# Training Doc2Vec on Wikipedia articles"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We conduct the replication to **Document Embedding with Paragraph Vectors** (http://arxiv.org/abs/1507.07998).\n",
-    "In this paper, they showed only DBOW results to Wikipedia data. So we replicate this experiments using not only DBOW but also DM."
+    "This notebook replicates the **Document Embedding with Paragraph Vectors** paper, http://arxiv.org/abs/1507.07998.\n",
+    "\n",
+    "In that paper, the authors only showed results from the DBOW (\"distributed bag of words\") mode, trained on the English Wikipedia. Here we replicate this experiment using not only DBOW, but also the DM (\"distributed memory\") mode of the Paragraph Vector algorithm aka Doc2Vec."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Basic Setup"
+    "## Basic setup"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Let's import Doc2Vec module."
+    "Let's import the necessary modules and set up logging. The code below assumes Python 3.7+ and Gensim 4.0+."
    ]
   },
   {
@@ -35,10 +36,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from gensim.corpora.wikicorpus import WikiCorpus\n",
-    "from gensim.models.doc2vec import Doc2Vec, TaggedDocument\n",
+    "import logging\n",
+    "import multiprocessing\n",
     "from pprint import pprint\n",
-    "import multiprocessing"
+    "\n",
+    "import smart_open\n",
+    "from gensim.corpora.wikicorpus import WikiCorpus, tokenize\n",
+    "from gensim.models.doc2vec import Doc2Vec, TaggedDocument\n",
+    "\n",
+    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)"
    ]
   },
   {
@@ -52,285 +58,422 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "First, download the dump of all Wikipedia articles from [here](http://download.wikimedia.org/enwiki/) (you want the file enwiki-latest-pages-articles.xml.bz2, or enwiki-YYYYMMDD-pages-articles.xml.bz2 for date-specific dumps).\n",
+    "First, download the dump of all Wikipedia articles from [here](http://download.wikimedia.org/enwiki/latest). You want the file named `enwiki-latest-pages-articles.xml.bz2`.\n",
     "\n",
-    "Second, convert the articles to WikiCorpus. WikiCorpus construct a corpus from a Wikipedia (or other MediaWiki-based) database dump.\n",
+    "Second, convert that Wikipedia article dump from the arcane Wikimedia XML format into a plain text file. This will make the subsequent training faster and also allow easy inspection of the data = \"input eyeballing\".\n",
     "\n",
-    "For more details on WikiCorpus, you should access [Corpus from a Wikipedia dump](https://radimrehurek.com/gensim/corpora/wikicorpus.html)."
+    "We'll preprocess each article at the same time, normalizing its text to lowercase, splitting into tokens, etc. Below I use a regexp tokenizer that simply looks for alphabetic sequences as tokens. But feel free to adapt the text preprocessing to your own domain. High quality preprocessing is often critical for the final pipeline accuracy – garbage in, garbage out!"
] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-16 11:23:20,663 : INFO : processing article #0: 'Anarchism' (6540 tokens)\n", + "2022-04-16 11:30:53,798 : INFO : processing article #500000: 'Onward Muslim Soldiers' (517 tokens)\n", + "2022-04-16 11:36:14,662 : INFO : processing article #1000000: 'Push Upstairs' (354 tokens)\n", + "2022-04-16 11:40:59,785 : INFO : processing article #1500000: 'Small nucleolar RNA Z278' (113 tokens)\n", + "2022-04-16 11:45:58,630 : INFO : processing article #2000000: '1925–26 Boston Bruins season' (556 tokens)\n", + "2022-04-16 11:51:03,737 : INFO : processing article #2500000: 'Tessier, Saskatchewan' (119 tokens)\n", + "2022-04-16 11:56:20,254 : INFO : processing article #3000000: 'Sebezhsky District' (908 tokens)\n", + "2022-04-16 12:01:59,089 : INFO : processing article #3500000: 'Niko Peleshi' (248 tokens)\n", + "2022-04-16 12:07:23,184 : INFO : processing article #4000000: 'Kudoa gunterae' (109 tokens)\n", + "2022-04-16 12:13:08,024 : INFO : processing article #4500000: 'Danko (singer)' (699 tokens)\n", + "2022-04-16 12:19:33,734 : INFO : processing article #5000000: 'Lada West Togliatti' (253 tokens)\n", + "2022-04-16 12:22:20,928 : INFO : finished iterating over Wikipedia corpus of 5205168 documents with 3016298486 positions (total 21961341 articles, 3093120544 positions before pruning articles shorter than 50 words)\n" + ] + } + ], "source": [ - "wiki = WikiCorpus(\"enwiki-latest-pages-articles.xml.bz2\")\n", - "#wiki = WikiCorpus(\"enwiki-YYYYMMDD-pages-articles.xml.bz2\")" + "wiki = WikiCorpus(\n", + " \"enwiki-latest-pages-articles.xml.bz2\", # path to the file you downloaded above\n", + " tokenizer_func=tokenize, # simple regexp; plug in your own tokenizer here\n", + " metadata=True, # also return the article titles and ids when parsing\n", + " dictionary={}, # don't start processing the data yet\n", + ")\n", + "\n", + "with smart_open.open(\"wiki.txt.gz\", \"w\", encoding='utf8') as fout:\n", + " for article_no, (content, (page_id, title)) in enumerate(wiki.get_texts()):\n", + " title = ' '.join(title.split())\n", + " if article_no % 500000 == 0:\n", + " logging.info(\"processing article #%i: %r (%i tokens)\", article_no, title, len(content))\n", + " fout.write(f\"{title}\\t{' '.join(content)}\\n\") # title_of_article [TAB] words of the article" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Define **TaggedWikiDocument** class to convert WikiCorpus into suitable form for Doc2Vec." + "The above took about 1 hour and created a new ~5.8 GB file named `wiki.txt.gz`. Note the output text was transparently compressed into `.gz` (GZIP) right away, using the [smart_open](https://github.com/RaRe-Technologies/smart_open) library, to save on disk space.\n", + "\n", + "Next we'll set up a document stream to load the preprocessed articles from `wiki.txt.gz` one by one, in the format expected by Doc2Vec, ready for training. We don't want to load everything into RAM at once, because that would blow up the memory. 
And it is not necessary – Gensim can handle streamed input training data:" ] }, { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "class TaggedWikiDocument(object):\n", - " def __init__(self, wiki):\n", - " self.wiki = wiki\n", - " self.wiki.metadata = True\n", + "class TaggedWikiCorpus:\n", + " def __init__(self, wiki_text_path):\n", + " self.wiki_text_path = wiki_text_path\n", + " \n", " def __iter__(self):\n", - " for content, (page_id, title) in self.wiki.get_texts():\n", - " yield TaggedDocument([c.decode(\"utf-8\") for c in content], [title])" + " for line in smart_open.open(self.wiki_text_path, encoding='utf8'):\n", + " title, words = line.split('\\t')\n", + " yield TaggedDocument(words=words.split(), tags=[title])\n", + "\n", + "documents = TaggedWikiCorpus('wiki.txt.gz') # A streamed iterable; nothing in RAM yet." ] }, { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "documents = TaggedWikiDocument(wiki)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preprocessing\n", - "To set the same vocabulary size with original paper. We first calculate the optimal **min_count** parameter." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "pre = Doc2Vec(min_count=0)\n", - "pre.scan_vocab(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "min_count: 0, size of vocab: 8545782.0\n", - "min_count: 1, size of vocab: 8545782.0\n", - "min_count: 2, size of vocab: 4227783.0\n", - "min_count: 3, size of vocab: 3008772.0\n", - "min_count: 4, size of vocab: 2439367.0\n", - "min_count: 5, size of vocab: 2090709.0\n", - "min_count: 6, size of vocab: 1856609.0\n", - "min_count: 7, size of vocab: 1681670.0\n", - "min_count: 8, size of vocab: 1546914.0\n", - "min_count: 9, size of vocab: 1437367.0\n", - "min_count: 10, size of vocab: 1346177.0\n", - "min_count: 11, size of vocab: 1267916.0\n", - "min_count: 12, size of vocab: 1201186.0\n", - "min_count: 13, size of vocab: 1142377.0\n", - "min_count: 14, size of vocab: 1090673.0\n", - "min_count: 15, size of vocab: 1043973.0\n", - "min_count: 16, size of vocab: 1002395.0\n", - "min_count: 17, size of vocab: 964684.0\n", - "min_count: 18, size of vocab: 930382.0\n", - "min_count: 19, size of vocab: 898725.0\n" + "['Anarchism'] : anarchism is political philosophy and movement that is sceptical of authority and rejects all involuntary coercive forms of hierarchy anarchism calls for the abolition of the state which it holds to be unnecessary undesirable and harmful as historically left wing movement placed on the farthest left of the political spectrum ……… criticism of philosophical anarchism defence of philosophical anarchism stating that both kinds of anarchism philosophical and political anarchism are philosophical and political claims anarchistic popular fiction novel an argument for philosophical anarchism external links anarchy archives anarchy archives is an online research center on the history and theory of anarchism\n" ] } ], "source": [ - "for num in range(0, 20):\n", - " print('min_count: {}, size of vocab: '.format(num), pre.scale_vocab(min_count=num, dry_run=True)['memory']['vocab']/700)" + "# Load and print the first preprocessed Wikipedia document, as a sanity check = \"input 
eyeballing\".\n", + "first_doc = next(iter(documents))\n", + "print(first_doc.tags, ': ', ' '.join(first_doc.words[:50] + ['………'] + first_doc.words[-50:]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In the original paper, they set the vocabulary size 915,715. It seems similar size of vocabulary if we set min_count = 19. (size of vocab = 898,725)" + "The document seems legit so let's move on to finally training some Doc2vec models." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Training the Doc2Vec Model\n", - "To train Doc2Vec model by several method, DBOW and DM, we define the list of models." + "## Training Doc2Vec" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The original paper had a vocabulary size of 915,715 word types, so we'll try to match it by setting `max_final_vocab` to 1,000,000 in the Doc2vec constructor.\n", + "\n", + "Other critical parameters were left unspecified in the paper, so we'll go with a window size of eight (a prediction window of 8 tokens to either side). It looks like the authors tried vector dimensionality of 100, 300, 1,000 & 10,000 in the paper (with 10k dims performing the best), but I'll only train with 200 dimensions here, to keep the RAM in check on my laptop.\n", + "\n", + "Feel free to tinker with these values yourself if you like:" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-18 12:05:46,344 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-04-18T12:05:46.344471', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'created'}\n", + "2022-04-18 12:05:46,345 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-04-18T12:05:46.345716', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'created'}\n" + ] + } + ], "source": [ - "cores = multiprocessing.cpu_count()\n", + "workers = 20 # multiprocessing.cpu_count() - 1 # leave one core for the OS & other stuff\n", + "\n", + "# PV-DBOW: paragraph vector in distributed bag of words mode\n", + "model_dbow = Doc2Vec(\n", + " dm=0, dbow_words=1, # dbow_words=1 to train word vectors at the same time too, not only DBOW\n", + " vector_size=200, window=8, epochs=10, workers=workers, max_final_vocab=1000000,\n", + ")\n", "\n", - "models = [\n", - " # PV-DBOW \n", - " Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=19, iter=10, workers=cores),\n", - " # PV-DM w/average\n", - " Doc2Vec(dm=1, dm_mean=1, size=200, window=8, min_count=19, iter =10, workers=cores),\n", - "]" + "# PV-DM: paragraph vector in distributed memory mode\n", + "model_dm = Doc2Vec(\n", + " dm=1, dm_mean=1, # use average of context word vectors to train DM\n", + " vector_size=200, window=8, epochs=10, workers=workers, max_final_vocab=1000000,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run one pass through the Wikipedia corpus, to collect the 1M vocabulary and initialize the doc2vec models:" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-18 12:05:47,311 : INFO : collecting all 
words and their counts\n", + "2022-04-18 12:05:47,313 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags\n", + "2022-04-18 12:07:35,880 : INFO : PROGRESS: at example #500000, processed 656884578 words (6050478 words/s), 3221051 word types, 500000 tags\n", + "2022-04-18 12:08:38,784 : INFO : PROGRESS: at example #1000000, processed 1021477892 words (5796084 words/s), 4478830 word types, 1000000 tags\n", + "2022-04-18 12:09:29,607 : INFO : PROGRESS: at example #1500000, processed 1308608477 words (5649726 words/s), 5419923 word types, 1500000 tags\n", + "2022-04-18 12:10:13,477 : INFO : PROGRESS: at example #2000000, processed 1554211349 words (5598537 words/s), 6190970 word types, 2000000 tags\n", + "2022-04-18 12:10:56,549 : INFO : PROGRESS: at example #2500000, processed 1794853915 words (5587147 words/s), 6943275 word types, 2500000 tags\n", + "2022-04-18 12:11:39,668 : INFO : PROGRESS: at example #3000000, processed 2032520202 words (5511955 words/s), 7668721 word types, 3000000 tags\n", + "2022-04-18 12:12:23,192 : INFO : PROGRESS: at example #3500000, processed 2268859232 words (5430192 words/s), 8352590 word types, 3500000 tags\n", + "2022-04-18 12:13:02,526 : INFO : PROGRESS: at example #4000000, processed 2493668037 words (5715482 words/s), 8977844 word types, 4000000 tags\n", + "2022-04-18 12:13:42,550 : INFO : PROGRESS: at example #4500000, processed 2709484503 words (5392235 words/s), 9612299 word types, 4500000 tags\n", + "2022-04-18 12:14:21,813 : INFO : PROGRESS: at example #5000000, processed 2932680226 words (5684768 words/s), 10226832 word types, 5000000 tags\n", + "2022-04-18 12:14:51,346 : INFO : collected 10469247 word types and 5205168 unique tags from a corpus of 5205168 examples and 3016298486 words\n", + "2022-04-18 12:14:55,076 : INFO : Doc2Vec lifecycle event {'msg': 'max_final_vocab=1000000 and min_count=5 resulted in calc_min_count=23, effective_min_count=23', 'datetime': '2022-04-18T12:14:55.076153', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", + "2022-04-18 12:14:55,076 : INFO : Creating a fresh vocabulary\n", + "2022-04-18 12:14:58,906 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 retains 996522 unique words (9.52% of original 10469247, drops 9472725)', 'datetime': '2022-04-18T12:14:58.906148', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", + "2022-04-18 12:14:58,906 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 leaves 2988436691 word corpus (99.08% of original 3016298486, drops 27861795)', 'datetime': '2022-04-18T12:14:58.906730', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", + "2022-04-18 12:15:01,747 : INFO : deleting the raw counts dictionary of 10469247 items\n", + "2022-04-18 12:15:01,860 : INFO : sample=0.001 downsamples 23 most-common words\n", + "2022-04-18 12:15:01,861 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 2431447874.2898555 word corpus (81.4%% of prior 2988436691)', 'datetime': '2022-04-18T12:15:01.861332', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 
'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", + "2022-04-18 12:15:07,001 : INFO : estimated required memory for 996522 words and 200 dimensions: 7297864200 bytes\n", + "2022-04-18 12:15:07,002 : INFO : resetting layer weights\n", + "2022-04-18 12:15:10,247 : INFO : resetting layer weights\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec(dbow+w,d200,hs,w8,mc19,t8)\n", - "Doc2Vec(dm/m,d200,hs,w8,mc19,t8)\n" + "Doc2Vec\n", + "Doc2Vec\n" ] } ], "source": [ - "models[0].build_vocab(documents)\n", - "print(str(models[0]))\n", - "models[1].reset_from(models[0])\n", - "print(str(models[1]))" + "model_dbow.build_vocab(documents, progress_per=500000)\n", + "print(model_dbow)\n", + "\n", + "# Save some time by copying the vocabulary structures from the DBOW model to the DM model.\n", + "# Both models are built on top of exactly the same data, so there's no need to repeat the vocab-building step.\n", + "model_dm.reset_from(model_dbow)\n", + "print(model_dm)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now we’re ready to train Doc2Vec of the English Wikipedia. " + "Now we’re ready to train Doc2Vec on the entirety of the English Wikipedia. **Warning!** Training this DBOW model takes ~14 hours, and DM ~6 hours, on my 2020 Linux machine." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "CPU times: user 5d 18h 24min 30s, sys: 26min 6s, total: 5d 18h 50min 36s\n", - "Wall time: 1d 2h 58min 58s\n", - "CPU times: user 1d 1h 28min 2s, sys: 33min 15s, total: 1d 2h 1min 18s\n", - "Wall time: 15h 27min 18s\n" + "2022-04-18 12:15:13,503 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 20 workers on 996522 vocabulary and 200 features, using sg=1 hs=0 sample=0.001 negative=5 window=8 shrink_windows=True', 'datetime': '2022-04-18T12:15:13.503265', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'train'}\n", + "2022-04-18 12:15:14,566 : INFO : EPOCH 0 - PROGRESS: at 0.00% examples, 299399 words/s, in_qsize 38, out_qsize 1\n", + "2022-04-18 12:45:14,574 : INFO : EPOCH 0 - PROGRESS: at 20.47% examples, 469454 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 13:15:14,578 : INFO : EPOCH 0 - PROGRESS: at 61.04% examples, 470927 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 13:40:53,256 : INFO : EPOCH 0: training on 3016298486 raw words (2421756111 effective words) took 5139.7s, 471184 effective words/s\n", + "2022-04-18 13:40:54,274 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 401497 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 14:10:54,283 : INFO : EPOCH 1 - PROGRESS: at 21.90% examples, 488616 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 14:40:54,290 : INFO : EPOCH 1 - PROGRESS: at 63.73% examples, 485374 words/s, in_qsize 40, out_qsize 0\n", + "2022-04-18 15:04:11,566 : INFO : EPOCH 1: training on 3016298486 raw words (2421755370 effective words) took 4998.3s, 484515 effective words/s\n", + "2022-04-18 15:04:12,590 : INFO : EPOCH 2 - PROGRESS: at 0.00% examples, 413109 words/s, in_qsize 38, out_qsize 2\n", + "2022-04-18 15:34:12,592 : INFO : EPOCH 2 - PROGRESS: at 21.94% examples, 489186 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 16:04:12,595 : INFO : EPOCH 2 - PROGRESS: at 64.02% examples, 487045 words/s, in_qsize 39, out_qsize 
0\n", + "2022-04-18 16:27:13,124 : INFO : EPOCH 2: training on 3016298486 raw words (2421749843 effective words) took 4981.6s, 486143 effective words/s\n", + "2022-04-18 16:27:14,132 : INFO : EPOCH 3 - PROGRESS: at 0.00% examples, 425720 words/s, in_qsize 37, out_qsize 0\n", + "2022-04-18 16:57:14,170 : INFO : EPOCH 3 - PROGRESS: at 22.16% examples, 492364 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 17:27:14,181 : INFO : EPOCH 3 - PROGRESS: at 64.36% examples, 489039 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 17:49:58,875 : INFO : EPOCH 3: training on 3016298486 raw words (2421759041 effective words) took 4965.7s, 487693 effective words/s\n", + "2022-04-18 17:49:59,888 : INFO : EPOCH 4 - PROGRESS: at 0.00% examples, 405295 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 18:19:59,893 : INFO : EPOCH 4 - PROGRESS: at 21.95% examples, 489379 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 18:49:59,917 : INFO : EPOCH 4 - PROGRESS: at 63.77% examples, 485582 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 19:13:19,358 : INFO : EPOCH 4: training on 3016298486 raw words (2421753794 effective words) took 5000.5s, 484304 effective words/s\n", + "2022-04-18 19:13:20,362 : INFO : EPOCH 5 - PROGRESS: at 0.00% examples, 417569 words/s, in_qsize 38, out_qsize 1\n", + "2022-04-18 19:43:20,366 : INFO : EPOCH 5 - PROGRESS: at 22.18% examples, 492529 words/s, in_qsize 40, out_qsize 0\n", + "2022-04-18 20:13:20,367 : INFO : EPOCH 5 - PROGRESS: at 64.36% examples, 489058 words/s, in_qsize 39, out_qsize 1\n", + "2022-04-18 20:36:01,806 : INFO : EPOCH 5: training on 3016298486 raw words (2421774390 effective words) took 4962.4s, 488021 effective words/s\n", + "2022-04-18 20:36:02,845 : INFO : EPOCH 6 - PROGRESS: at 0.00% examples, 376602 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 21:06:02,845 : INFO : EPOCH 6 - PROGRESS: at 21.77% examples, 486989 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 21:36:02,858 : INFO : EPOCH 6 - PROGRESS: at 63.44% examples, 483745 words/s, in_qsize 40, out_qsize 0\n", + "2022-04-18 21:59:40,920 : INFO : EPOCH 6: training on 3016298486 raw words (2421753569 effective words) took 5019.1s, 482507 effective words/s\n", + "2022-04-18 21:59:41,945 : INFO : EPOCH 7 - PROGRESS: at 0.00% examples, 410164 words/s, in_qsize 38, out_qsize 1\n", + "2022-04-18 22:29:41,989 : INFO : EPOCH 7 - PROGRESS: at 22.09% examples, 491334 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 22:59:42,000 : INFO : EPOCH 7 - PROGRESS: at 64.16% examples, 487826 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 23:22:40,504 : INFO : EPOCH 7: training on 3016298486 raw words (2421770259 effective words) took 4979.6s, 486340 effective words/s\n", + "2022-04-18 23:22:41,509 : INFO : EPOCH 8 - PROGRESS: at 0.00% examples, 294981 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 23:52:41,532 : INFO : EPOCH 8 - PROGRESS: at 21.64% examples, 485279 words/s, in_qsize 40, out_qsize 0\n", + "2022-04-19 00:22:41,533 : INFO : EPOCH 8 - PROGRESS: at 63.05% examples, 481687 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 00:46:43,879 : INFO : EPOCH 8: training on 3016298486 raw words (2421753439 effective words) took 5043.4s, 480185 effective words/s\n", + "2022-04-19 00:46:44,905 : INFO : EPOCH 9 - PROGRESS: at 0.00% examples, 383709 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 01:16:44,926 : INFO : EPOCH 9 - PROGRESS: at 21.82% examples, 487579 words/s, in_qsize 40, out_qsize 0\n", + "2022-04-19 01:46:44,928 : INFO : EPOCH 9 - PROGRESS: at 63.44% examples, 483731 
words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 02:10:25,029 : INFO : EPOCH 9: training on 3016298486 raw words (2421762745 effective words) took 5021.1s, 482313 effective words/s\n", + "2022-04-19 02:10:25,030 : INFO : Doc2Vec lifecycle event {'msg': 'training on 30162984860 raw words (24217588561 effective words) took 50111.5s, 483274 effective words/s', 'datetime': '2022-04-19T02:10:25.030386', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'train'}\n" ] } ], "source": [ - "for model in models:\n", - " %%time model.train(documents, total_examples=model.corpus_count, epochs=model.iter)" + "# Train DBOW doc2vec incl. word vectors.\n", + "# Report progress every ½ hour.\n", + "model_dbow.train(documents, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs, report_delay=30*60)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-19 02:10:25,033 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 20 workers on 996522 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=8 shrink_windows=True', 'datetime': '2022-04-19T02:10:25.033682', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'train'}\n", + "2022-04-19 02:10:26,039 : INFO : EPOCH 0 - PROGRESS: at 0.01% examples, 1154750 words/s, in_qsize 0, out_qsize 2\n", + "2022-04-19 02:40:26,040 : INFO : EPOCH 0 - PROGRESS: at 83.97% examples, 1182619 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 02:44:58,625 : INFO : EPOCH 0: training on 3016298486 raw words (2421749575 effective words) took 2073.6s, 1167903 effective words/s\n", + "2022-04-19 02:44:59,635 : INFO : EPOCH 1 - PROGRESS: at 0.01% examples, 1565065 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 03:14:59,636 : INFO : EPOCH 1 - PROGRESS: at 84.22% examples, 1185115 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 03:19:27,814 : INFO : EPOCH 1: training on 3016298486 raw words (2421738810 effective words) took 2069.2s, 1170383 effective words/s\n", + "2022-04-19 03:19:28,819 : INFO : EPOCH 2 - PROGRESS: at 0.01% examples, 1582102 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 03:49:28,822 : INFO : EPOCH 2 - PROGRESS: at 84.33% examples, 1186338 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 03:53:55,901 : INFO : EPOCH 2: training on 3016298486 raw words (2421754027 effective words) took 2068.1s, 1171014 effective words/s\n", + "2022-04-19 03:53:56,905 : INFO : EPOCH 3 - PROGRESS: at 0.01% examples, 1586215 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 04:23:56,914 : INFO : EPOCH 3 - PROGRESS: at 84.30% examples, 1186028 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 04:28:23,932 : INFO : EPOCH 3: training on 3016298486 raw words (2421734506 effective words) took 2068.0s, 1171036 effective words/s\n", + "2022-04-19 04:28:24,943 : INFO : EPOCH 4 - PROGRESS: at 0.01% examples, 1594202 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 04:58:24,946 : INFO : EPOCH 4 - PROGRESS: at 84.53% examples, 1188348 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 05:02:49,190 : INFO : EPOCH 4: training on 3016298486 raw words (2421739011 effective words) took 2065.3s, 1172611 effective words/s\n", + "2022-04-19 05:02:50,203 : INFO : EPOCH 5 - PROGRESS: at 0.01% examples, 1590285 
words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 05:32:50,205 : INFO : EPOCH 5 - PROGRESS: at 84.51% examples, 1188165 words/s, in_qsize 38, out_qsize 0\n", + "2022-04-19 05:37:12,922 : INFO : EPOCH 5: training on 3016298486 raw words (2421759651 effective words) took 2063.7s, 1173488 effective words/s\n", + "2022-04-19 05:37:13,928 : INFO : EPOCH 6 - PROGRESS: at 0.01% examples, 1574494 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 06:07:13,930 : INFO : EPOCH 6 - PROGRESS: at 84.61% examples, 1189231 words/s, in_qsize 40, out_qsize 0\n", + "2022-04-19 06:11:35,588 : INFO : EPOCH 6: training on 3016298486 raw words (2421751669 effective words) took 2062.7s, 1174090 effective words/s\n", + "2022-04-19 06:11:36,605 : INFO : EPOCH 7 - PROGRESS: at 0.01% examples, 1584768 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 06:41:36,617 : INFO : EPOCH 7 - PROGRESS: at 84.50% examples, 1188066 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 06:46:00,286 : INFO : EPOCH 7: training on 3016298486 raw words (2421751802 effective words) took 2064.7s, 1172935 effective words/s\n", + "2022-04-19 06:46:01,290 : INFO : EPOCH 8 - PROGRESS: at 0.01% examples, 1610826 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 07:16:01,295 : INFO : EPOCH 8 - PROGRESS: at 84.71% examples, 1190249 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 07:20:20,193 : INFO : EPOCH 8: training on 3016298486 raw words (2421731383 effective words) took 2059.9s, 1175653 effective words/s\n", + "2022-04-19 07:20:21,198 : INFO : EPOCH 9 - PROGRESS: at 0.01% examples, 1591209 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 07:50:21,200 : INFO : EPOCH 9 - PROGRESS: at 84.65% examples, 1189549 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 07:54:42,812 : INFO : EPOCH 9: training on 3016298486 raw words (2421765551 effective words) took 2062.6s, 1174124 effective words/s\n", + "2022-04-19 07:54:42,813 : INFO : Doc2Vec lifecycle event {'msg': 'training on 30162984860 raw words (24217475985 effective words) took 20657.8s, 1172317 effective words/s', 'datetime': '2022-04-19T07:54:42.813436', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'train'}\n" + ] + } + ], + "source": [ + "# Train DM doc2vec.\n", + "model_dm.train(documents, total_examples=model_dm.corpus_count, epochs=model_dm.epochs, report_delay=30*60)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Similarity interface" + "## Finding similar documents" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "After that, let's test both models! DBOW model show similar results with the original paper. First, calculating cosine similarity of \"Machine learning\" using Paragraph Vector. Word Vector and Document Vector are separately stored. We have to add .docvecs after model name to extract Document Vector from Doc2Vec Model." + "After that, let's test both models! The DBOW model shows similar results as the original paper.\n", + "\n", + "First, calculate the most similar Wikipedia articles to the \"Machine learning\" article. 
The calculated word vectors and document vectors are stored separately, in `model.wv` and `model.dv` respectively:" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec(dbow+w,d200,hs,w8,mc19,t8)\n", - "[('Theoretical computer science', 0.7256590127944946),\n", - " ('Artificial neural network', 0.7162272930145264),\n", - " ('Pattern recognition', 0.6948175430297852),\n", - " ('Data mining', 0.6938608884811401),\n", - " ('Bayesian network', 0.6938260197639465),\n", - " ('Support vector machine', 0.6706081628799438),\n", - " ('Glossary of artificial intelligence', 0.670173704624176),\n", - " ('Computational learning theory', 0.6648679971694946),\n", - " ('Outline of computer science', 0.6638073921203613),\n", - " ('List of important publications in computer science', 0.663051187992096),\n", - " ('Mathematical optimization', 0.655048131942749),\n", - " ('Theory of computation', 0.6508707404136658),\n", - " ('Word-sense disambiguation', 0.6505812406539917),\n", - " ('Reinforcement learning', 0.6480429172515869),\n", - " (\"Solomonoff's theory of inductive inference\", 0.6459559202194214),\n", - " ('Computational intelligence', 0.6458009481430054),\n", - " ('Information visualization', 0.6437181234359741),\n", - " ('Algorithmic composition', 0.643247127532959),\n", - " ('Ray Solomonoff', 0.6425477862358093),\n", - " ('Kriging', 0.6425424814224243)]\n", - "Doc2Vec(dm/m,d200,hs,w8,mc19,t8)\n", - "[('Artificial neural network', 0.640324592590332),\n", - " ('Pattern recognition', 0.6244156360626221),\n", - " ('Data stream mining', 0.6140210032463074),\n", - " ('Theoretical computer science', 0.5964258909225464),\n", - " ('Outline of computer science', 0.5862746834754944),\n", - " ('Supervised learning', 0.5847170352935791),\n", - " ('Data mining', 0.5817658305168152),\n", - " ('Decision tree learning', 0.5785809755325317),\n", - " ('Bayesian network', 0.5768401622772217),\n", - " ('Computational intelligence', 0.5717238187789917),\n", - " ('Theory of computation', 0.5703311562538147),\n", - " ('Bayesian programming', 0.5693561434745789),\n", - " ('Reinforcement learning', 0.564978837966919),\n", - " ('Helmholtz machine', 0.564972460269928),\n", - " ('Inductive logic programming', 0.5631471276283264),\n", - " ('Algorithmic learning theory', 0.563083291053772),\n", - " ('Semi-supervised learning', 0.5628935694694519),\n", - " ('Early stopping', 0.5597405433654785),\n", - " ('Decision tree', 0.5596889853477478),\n", - " ('Artificial intelligence', 0.5569720268249512)]\n" + "Doc2Vec\n", + "[('Supervised learning', 0.7491602301597595),\n", + " ('Pattern recognition', 0.7462332844734192),\n", + " ('Artificial neural network', 0.7142727971076965),\n", + " ('Data mining', 0.6930587887763977),\n", + " ('Computer mathematics', 0.686907947063446),\n", + " ('Deep learning', 0.6868096590042114),\n", + " ('Multi-task learning', 0.6859176158905029),\n", + " ('Outline of computer science', 0.6858125925064087),\n", + " ('Boosting (machine learning)', 0.6807966828346252),\n", + " ('Linear classifier', 0.6807013154029846),\n", + " ('Learning classifier system', 0.679194450378418),\n", + " ('Knowledge retrieval', 0.6765366196632385),\n", + " ('Perceptron', 0.675654947757721),\n", + " ('Incremental learning', 0.6712607741355896),\n", + " ('Support-vector machine', 0.6711161136627197),\n", + " ('Feature selection', 0.6696343421936035),\n", + " ('Image segmentation', 0.6688867211341858),\n", + " ('Neural 
network', 0.6670624017715454),\n", + " ('Reinforcement learning', 0.6666402220726013),\n", + " ('Feature extraction', 0.6657401323318481)]\n", + "Doc2Vec\n", + "[('Pattern recognition', 0.7151365280151367),\n", + " ('Supervised learning', 0.7006939053535461),\n", + " ('Multi-task learning', 0.6899284720420837),\n", + " ('Semi-supervised learning', 0.674682080745697),\n", + " ('Statistical classification', 0.6649825572967529),\n", + " ('Deep learning', 0.6647047400474548),\n", + " ('Artificial neural network', 0.66275954246521),\n", + " ('Feature selection', 0.6612880825996399),\n", + " ('Statistical learning theory', 0.6528184413909912),\n", + " ('Naive Bayes classifier', 0.6506016850471497),\n", + " ('Automatic image annotation', 0.6491228342056274),\n", + " ('Regularization (mathematics)', 0.6452057957649231),\n", + " ('Early stopping', 0.6439507007598877),\n", + " ('Support-vector machine', 0.64285808801651),\n", + " ('Meta learning (computer science)', 0.6418778300285339),\n", + " ('Linear classifier', 0.6391816735267639),\n", + " ('Empirical risk minimization', 0.6339778900146484),\n", + " ('Anomaly detection', 0.6328380703926086),\n", + " ('Predictive Model Markup Language', 0.6314322352409363),\n", + " ('Learning classifier system', 0.6307871341705322)]\n" ] } ], "source": [ - "for model in models:\n", - " print(str(model))\n", - " pprint(model.docvecs.most_similar(positive=[\"Machine learning\"], topn=20))" + "for model in [model_dbow, model_dm]:\n", + " print(model)\n", + " pprint(model.dv.most_similar(positive=[\"Machine learning\"], topn=20))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "DBOW model interpret the word 'Machine Learning' as a part of Computer Science field, and DM model as Data Science related field.\n", + "Both results seem similar and match the results from the paper's Table 1, although not exactly. This is because we don't know the exact parameters of the original implementation (see above). And also because we're training the model 7 years later and the Wikipedia content has changed in the meantime.\n", "\n", - "Second, calculating cosine simillarity of \"Lady Gaga\" using Paragraph Vector." 
+ "Now following the paper's Table 2a), let's calculate the most similar Wikipedia entries to \"Lady Gaga\" using Paragraph Vector:" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": { "scrolled": false }, @@ -339,35 +482,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec(dbow+w,d200,hs,w8,mc19,t8)\n", - "[('Katy Perry', 0.7374469637870789),\n", - " ('Adam Lambert', 0.6972734928131104),\n", - " ('Miley Cyrus', 0.6212848424911499),\n", - " ('List of awards and nominations received by Lady Gaga', 0.6138384938240051),\n", - " ('Nicole Scherzinger', 0.6092700958251953),\n", - " ('Christina Aguilera', 0.6062655448913574),\n", - " ('Nicki Minaj', 0.6019431948661804),\n", - " ('Taylor Swift', 0.5973174571990967),\n", - " ('The Pussycat Dolls', 0.5888757705688477),\n", - " ('Beyoncé', 0.5844652652740479)]\n", - "Doc2Vec(dm/m,d200,hs,w8,mc19,t8)\n", - "[('ArtRave: The Artpop Ball', 0.5719832181930542),\n", - " ('Artpop', 0.5651129484176636),\n", - " ('Katy Perry', 0.5571318864822388),\n", - " ('The Fame', 0.5388195514678955),\n", - " ('The Fame Monster', 0.5380634069442749),\n", - " ('G.U.Y.', 0.5365751385688782),\n", - " ('Beautiful, Dirty, Rich', 0.5329179763793945),\n", - " ('Applause (Lady Gaga song)', 0.5328119993209839),\n", - " ('The Monster Ball Tour', 0.5299569368362427),\n", - " ('Lindsey Stirling', 0.5281971096992493)]\n" + "Doc2Vec\n", + "[('Katy Perry', 0.7450265884399414),\n", + " ('Miley Cyrus', 0.7275323867797852),\n", + " ('Ariana Grande', 0.7223592400550842),\n", + " ('Adele', 0.6982873678207397),\n", + " ('Taylor Swift', 0.6901045441627502),\n", + " ('Demi Lovato', 0.6819911003112793),\n", + " ('Adam Lambert', 0.6552075147628784),\n", + " ('Nicki Minaj', 0.6513625383377075),\n", + " ('Selena Gomez', 0.6427122354507446),\n", + " ('Rihanna', 0.6323978304862976)]\n", + "Doc2Vec\n", + "[('Born This Way (album)', 0.6612793803215027),\n", + " ('Artpop', 0.6428781747817993),\n", + " ('Beautiful, Dirty, Rich', 0.6408763527870178),\n", + " ('Lady Gaga videography', 0.6143141388893127),\n", + " ('Lady Gaga discography', 0.6102882027626038),\n", + " ('Katy Perry', 0.6046711802482605),\n", + " ('Beyoncé', 0.6015700697898865),\n", + " ('List of Lady Gaga live performances', 0.5977909564971924),\n", + " ('Artpop (song)', 0.5930275917053223),\n", + " ('Born This Way (song)', 0.5911758542060852)]\n" ] } ], "source": [ - "for model in models:\n", - " print(str(model))\n", - " pprint(model.docvecs.most_similar(positive=[\"Lady Gaga\"], topn=10))" + "for model in [model_dbow, model_dm]:\n", + " print(model)\n", + " pprint(model.dv.most_similar(positive=[\"Lady Gaga\"], topn=10))" ] }, { @@ -376,14 +519,18 @@ "collapsed": true }, "source": [ - "DBOW model reveal the similar singer in the U.S., and DM model understand that many of Lady Gaga's songs are similar with the word \"Lady Gaga\".\n", + "The DBOW results are in line with what the paper shows in Table 2a), revealing similar singers in the U.S.\n", "\n", - "Third, calculating cosine simillarity of \"Lady Gaga\" - \"American\" + \"Japanese\" using Document vector and Word Vectors. \"American\" and \"Japanese\" are Word Vectors, not Paragraph Vectors. Word Vectors are already converted to lowercases by WikiCorpus." + "Interestingly, the DM results seem to capture more \"fact about Lady Gaga\" (her albums, trivia), whereas DBOW recovered \"similar artists\".\n", + "\n", + "**Finally, let's do some of the wilder arithmetics that vectors embeddings are famous for**. 
What are the entries most similar to \"Lady Gaga\" - \"American\" + \"Japanese\"? This replicates Table 2b) of the paper.\n", + "\n", + "Note that \"American\" and \"Japanese\" are word vectors, but they live in the same space as the document vectors, so we can add and subtract them at will, for some interesting results. All word vectors were already lowercased by our tokenizer above, so we look for the lowercased versions here:" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": { "scrolled": false }, @@ -392,51 +539,100 @@ "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec(dbow+w,d200,hs,w8,mc19,t8)\n", - "[('Game (Perfume album)', 0.5571034550666809),\n", - " ('Katy Perry', 0.5537782311439514),\n", - " ('Taboo (Kumi Koda song)', 0.5304880142211914),\n", - " ('Kylie Minogue', 0.5234110355377197),\n", - " ('Ayumi Hamasaki', 0.5110630989074707),\n", - " (\"Girls' Generation\", 0.4996713399887085),\n", - " ('Britney Spears', 0.49094343185424805),\n", - " ('Koda Kumi', 0.48719698190689087),\n", - " ('Perfume (Japanese band)', 0.48536181449890137),\n", - " ('Kara (South Korean band)', 0.48507893085479736)]\n", - "Doc2Vec(dm/m,d200,hs,w8,mc19,t8)\n", - "[('Artpop', 0.47699037194252014),\n", - " ('Jessie J', 0.4439432621002197),\n", - " ('Haus of Gaga', 0.43463900685310364),\n", - " ('The Fame', 0.4278091788291931),\n", - " ('List of awards and nominations received by Lady Gaga', 0.4268512427806854),\n", - " ('Applause (Lady Gaga song)', 0.41563737392425537),\n", - " ('New Cutie Honey', 0.4152414798736572),\n", - " ('M.I.A. (rapper)', 0.4091864228248596),\n", - " ('Mama Do (Uh Oh, Uh Oh)', 0.4044945538043976),\n", - " ('The Fame Monster', 0.40421998500823975)]\n" + "Doc2Vec\n", + "[('Ayumi Hamasaki', 0.6339365839958191),\n", + " ('Katy Perry', 0.5903329849243164),\n", + " ('2NE1', 0.5886631608009338),\n", + " (\"Girls' Generation\", 0.5769038796424866),\n", + " ('Flying Easy Loving Crazy', 0.5748921036720276),\n", + " ('Love Life 2', 0.5738793611526489),\n", + " ('Ariana Grande', 0.5715743899345398),\n", + " ('Game (Perfume album)', 0.569789707660675),\n", + " ('We Are \"Lonely Girl\"', 0.5696560740470886),\n", + " ('H (Ayumi Hamasaki EP)', 0.5691372156143188)]\n", + "Doc2Vec\n", + "[('Radwimps', 0.548571765422821),\n", + " ('Chisato Moritaka', 0.5456540584564209),\n", + " ('Suzuki Ami Around the World: Live House Tour 2005', 0.5375290513038635),\n", + " ('Anna Suda', 0.5338292121887207),\n", + " ('Beautiful, Dirty, Rich', 0.5309030413627625),\n", + " ('Momoiro Clover Z', 0.5304197072982788),\n", + " ('Pink Lady (duo)', 0.5268998742103577),\n", + " ('Reol (singer)', 0.5237400531768799),\n", + " ('Ami Suzuki', 0.5232592225074768),\n", + " ('Kaela Kimura', 0.5219823122024536)]\n" ] } ], "source": [ - "for model in models:\n", - " print(str(model))\n", - " vec = [model.docvecs[\"Lady Gaga\"] - model[\"american\"] + model[\"japanese\"]]\n", - " pprint([m for m in model.docvecs.most_similar(vec, topn=11) if m[0] != \"Lady Gaga\"])" + "for model in [model_dbow, model_dm]:\n", + " print(model)\n", + " vec = [model.dv[\"Lady Gaga\"] - model.wv[\"american\"] + model.wv[\"japanese\"]]\n", + " pprint([m for m in model.dv.most_similar(vec, topn=11) if m[0] != \"Lady Gaga\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "As a result, DBOW model demonstrate similar artists to Lady Gaga in Japan such as 'Perfume', who is the most famous idol in Japan. On the other hand, DM model results don't include Japanese artists in top 10 similar documents. 
It's almost the same with no vector calculated results.\n", + "As a result, the DBOW model surfaced artists similar to Lady Gaga in Japan, such as **Ayumi Hamasaki**, whose Wiki bio says:\n", + "\n", + "> Ayumi Hamasaki is a Japanese singer, songwriter, record producer, actress, model, spokesperson, and entrepreneur.\n", "\n", - "These results demonstrate that the DBOW employed in the original paper is outstanding for calculating similarity between Document Vector and Word Vector." + "So that sounds like a success: it's also the number one hit in the paper we're replicating!\n", + "\n", + "The DM model results are opaque to me, but they too seem related to art and Japan. The score deltas between these DM results are marginal, so they would likely change if we retrained on a different version of Wikipedia, or even re-ran on the same version: the doc2vec training algorithm is stochastic.\n", + "\n", + "These results demonstrate that both training modes employed in the original paper are outstanding for calculating similarity between document vectors, word vectors, or a combination of both. The DM mode has the added advantage of being faster to train: roughly 2.4x faster here (20,658 vs 50,112 seconds in the training logs above), although our DBOW run also trained word vectors." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "If you wanted to continue working with these trained models, you could save them to disk, to avoid having to re-train them from scratch every time:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ + "2022-04-19 07:54:48,399 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_dbow.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-04-19T07:54:48.399560', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'saving'}\n", + "2022-04-19 07:54:48,400 : INFO : storing np array 'vectors' to doc2vec_dbow.model.dv.vectors.npy\n", + "2022-04-19 07:54:49,613 : INFO : storing np array 'vectors' to doc2vec_dbow.model.wv.vectors.npy\n", + "2022-04-19 07:54:49,875 : INFO : storing np array 'syn1neg' to doc2vec_dbow.model.syn1neg.npy\n", + "2022-04-19 07:54:50,135 : INFO : not storing attribute cum_table\n", + "2022-04-19 07:54:53,026 : INFO : saved doc2vec_dbow.model\n", + "2022-04-19 07:54:53,027 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_dm.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-04-19T07:54:53.027661', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'saving'}\n", + "2022-04-19 07:54:53,028 : INFO : storing np array 'vectors' to doc2vec_dm.model.dv.vectors.npy\n", + "2022-04-19 07:54:54,556 : INFO : storing np array 'vectors' to doc2vec_dm.model.wv.vectors.npy\n", + "2022-04-19 07:54:54,808 : INFO : storing np array 'syn1neg' to doc2vec_dm.model.syn1neg.npy\n", + "2022-04-19 07:54:55,058 : INFO : not storing attribute cum_table\n", + "2022-04-19 07:54:57,872 : INFO : saved doc2vec_dm.model\n" + ] + } + ], + "source": [ + "model_dbow.save('doc2vec_dbow.model')\n", + "model_dm.save('doc2vec_dm.model')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "To continue your doc2vec explorations, refer to the official API documentation in Gensim: https://radimrehurek.com/gensim/models/doc2vec.html 
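As a quick complement to the `save()` calls above, here is a minimal sketch of loading those persisted models back in a later session. It is not part of the notebook diff; it assumes the two model files sit in the current working directory and uses Gensim's standard `Doc2Vec.load`, the counterpart of `save`:

.. code-block:: python

    from gensim.models.doc2vec import Doc2Vec

    # Load the models persisted above; the companion .npy arrays that
    # save() wrote next to each model file are picked up automatically.
    model_dbow = Doc2Vec.load('doc2vec_dbow.model')
    model_dm = Doc2Vec.load('doc2vec_dm.model')

    # The loaded models answer the same similarity queries as before.
    print(model_dbow.dv.most_similar('Machine learning', topn=3))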
] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -450,7 +646,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/docs/src/_templates/indexcontent.html b/docs/src/_templates/indexcontent.html index 0ca74b7210..396f48b50d 100644 --- a/docs/src/_templates/indexcontent.html +++ b/docs/src/_templates/indexcontent.html @@ -199,6 +199,15 @@

Testing Gensim

+
+

Or, to install and test Gensim locally:

+

+                      pip install -e .  # compile and install Gensim from the current directory
+                    
+

+                      pytest gensim     # run the tests
+                    
+
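To complement the install-and-test snippet added above, a quick smoke test you might run after `pip install -e .` but before the full `pytest gensim` suite. This is an illustrative sketch, not part of the documented workflow; it only assumes the public `gensim.corpora` API:

.. code-block:: python

    # Sanity-check a local Gensim install: import the package and run a
    # tiny bag-of-words round trip through a Dictionary.
    import gensim
    from gensim import corpora

    print(gensim.__version__)

    texts = [["human", "computer", "interaction"], ["graph", "trees"]]
    dictionary = corpora.Dictionary(texts)
    print(dictionary.doc2bow(["human", "computer"]))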
diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index 583e4528a9..39e29b8003 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -60,6 +60,8 @@ Modules: similarities/termsim similarities/annoy similarities/nmslib + similarities/levenshtein + similarities/fastss test/utils topic_coherence/aggregation topic_coherence/direct_confirmation_measure diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb index 875db7b507..80606bfde4 100644 --- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb +++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb @@ -249,7 +249,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Other formats include `Joachim's SVMlight format `_,\n`Blei's LDA-C format `_ and\n`GibbsLDA++ format `_.\n\n" + "Other formats include `Joachim's SVMlight format `_,\n`Blei's LDA-C format `_ and\n`GibbsLDA++ format `_.\n\n" ] }, { @@ -424,7 +424,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py index 983a9d1235..d02e7d3418 100644 --- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py +++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py @@ -222,7 +222,7 @@ def __iter__(self): ############################################################################### # Other formats include `Joachim's SVMlight format `_, -# `Blei's LDA-C format `_ and +# `Blei's LDA-C format `_ and # `GibbsLDA++ format `_. corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus) diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5 b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5 index 174fe2a139..860d4a2586 100644 --- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5 +++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5 @@ -1 +1 @@ -55a8a886f05e5005c5f66d57569ee79d \ No newline at end of file +986566c5996bfc214bd711c0d2cf54db \ No newline at end of file diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst index 3cc549dd65..f49b214562 100644 --- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst +++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst @@ -178,12 +178,12 @@ between the questions and ids is called a dictionary: .. code-block:: none - 2021-06-01 10:34:56,824 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2021-06-01 10:34:56,824 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions) - 2021-06-01 10:34:56,834 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) 
from 9 documents (total 29 corpus positions)", 'datetime': '2021-06-01T10:34:56.825003', 'gensim': '4.1.0.dev0', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-73-generic-x86_64-with-glibc2.29', 'event': 'created'} - 2021-06-01 10:34:56,834 : INFO : Dictionary lifecycle event {'fname_or_handle': '/tmp/deerwester.dict', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-06-01T10:34:56.834300', 'gensim': '4.1.0.dev0', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-73-generic-x86_64-with-glibc2.29', 'event': 'saving'} - 2021-06-01 10:34:56,834 : INFO : saved /tmp/deerwester.dict - Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) + 2022-04-22 19:16:03,056 : INFO : adding document #0 to Dictionary<0 unique tokens: []> + 2022-04-22 19:16:03,057 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions) + 2022-04-22 19:16:03,068 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2022-04-22T19:16:03.057201', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Mar 15 2022, 12:22:08) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-104-generic-x86_64-with-glibc2.29', 'event': 'created'} + 2022-04-22 19:16:03,069 : INFO : Dictionary lifecycle event {'fname_or_handle': '/tmp/deerwester.dict', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-04-22T19:16:03.069013', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Mar 15 2022, 12:22:08) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-104-generic-x86_64-with-glibc2.29', 'event': 'saving'} + 2022-04-22 19:16:03,069 : INFO : saved /tmp/deerwester.dict + Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> @@ -273,11 +273,11 @@ therefore reads: in the document `"Human computer interaction"`, the words `comp .. code-block:: none - 2021-06-01 10:34:57,074 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm - 2021-06-01 10:34:57,075 : INFO : saving sparse matrix to /tmp/deerwester.mm - 2021-06-01 10:34:57,075 : INFO : PROGRESS: saving document #0 - 2021-06-01 10:34:57,076 : INFO : saved 9x12 matrix, density=25.926% (28/108) - 2021-06-01 10:34:57,076 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index + 2022-04-22 19:16:03,436 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm + 2022-04-22 19:16:03,446 : INFO : saving sparse matrix to /tmp/deerwester.mm + 2022-04-22 19:16:03,447 : INFO : PROGRESS: saving document #0 + 2022-04-22 19:16:03,449 : INFO : saved 9x12 matrix, density=25.926% (28/108) + 2022-04-22 19:16:03,449 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index [[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]] @@ -372,7 +372,7 @@ then convert the tokens via a dictionary to their ids and yield the resulting sp .. code-block:: none - <__main__.MyCorpus object at 0x7f389b5f8520> + <__main__.MyCorpus object at 0x7ff5d5552250> @@ -450,10 +450,10 @@ Similarly, to construct the dictionary without loading all texts into memory: .. 
code-block:: none - 2021-06-01 10:34:58,466 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2021-06-01 10:34:58,467 : INFO : built Dictionary(42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...) from 9 documents (total 69 corpus positions) - 2021-06-01 10:34:58,467 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...) from 9 documents (total 69 corpus positions)", 'datetime': '2021-06-01T10:34:58.467454', 'gensim': '4.1.0.dev0', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-73-generic-x86_64-with-glibc2.29', 'event': 'created'} - Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) + 2022-04-22 19:16:05,452 : INFO : adding document #0 to Dictionary<0 unique tokens: []> + 2022-04-22 19:16:05,455 : INFO : built Dictionary<42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...> from 9 documents (total 69 corpus positions) + 2022-04-22 19:16:05,455 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...> from 9 documents (total 69 corpus positions)", 'datetime': '2022-04-22T19:16:05.455728', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Mar 15 2022, 12:22:08) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-104-generic-x86_64-with-glibc2.29', 'event': 'created'} + Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> @@ -502,11 +502,11 @@ create a toy corpus of 2 documents, as a plain Python list .. code-block:: none - 2021-06-01 10:34:58,603 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm - 2021-06-01 10:34:58,604 : INFO : saving sparse matrix to /tmp/corpus.mm - 2021-06-01 10:34:58,604 : INFO : PROGRESS: saving document #0 - 2021-06-01 10:34:58,604 : INFO : saved 2x2 matrix, density=25.000% (1/4) - 2021-06-01 10:34:58,604 : INFO : saving MmCorpus index to /tmp/corpus.mm.index + 2022-04-22 19:16:05,705 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm + 2022-04-22 19:16:05,708 : INFO : saving sparse matrix to /tmp/corpus.mm + 2022-04-22 19:16:05,708 : INFO : PROGRESS: saving document #0 + 2022-04-22 19:16:05,708 : INFO : saved 2x2 matrix, density=25.000% (1/4) + 2022-04-22 19:16:05,709 : INFO : saving MmCorpus index to /tmp/corpus.mm.index @@ -514,7 +514,7 @@ create a toy corpus of 2 documents, as a plain Python list .. GENERATED FROM PYTHON SOURCE LINES 224-227 Other formats include `Joachim's SVMlight format `_, -`Blei's LDA-C format `_ and +`Blei's LDA-C format `_ and `GibbsLDA++ format `_. .. GENERATED FROM PYTHON SOURCE LINES 227-233 @@ -537,16 +537,16 @@ Other formats include `Joachim's SVMlight format .. 
code-block:: none - 2021-06-01 10:34:58,653 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight - 2021-06-01 10:34:58,654 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index - 2021-06-01 10:34:58,654 : INFO : no word id mapping provided; initializing from corpus - 2021-06-01 10:34:58,654 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c - 2021-06-01 10:34:58,654 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab - 2021-06-01 10:34:58,654 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index - 2021-06-01 10:34:58,707 : INFO : no word id mapping provided; initializing from corpus - 2021-06-01 10:34:58,708 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low - 2021-06-01 10:34:58,708 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value - 2021-06-01 10:34:58,708 : INFO : saving LowCorpus index to /tmp/corpus.low.index + 2022-04-22 19:16:05,818 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight + 2022-04-22 19:16:05,820 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index + 2022-04-22 19:16:05,821 : INFO : no word id mapping provided; initializing from corpus + 2022-04-22 19:16:05,821 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c + 2022-04-22 19:16:05,821 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab + 2022-04-22 19:16:05,822 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index + 2022-04-22 19:16:05,934 : INFO : no word id mapping provided; initializing from corpus + 2022-04-22 19:16:05,936 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low + 2022-04-22 19:16:05,937 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value + 2022-04-22 19:16:05,937 : INFO : saving LowCorpus index to /tmp/corpus.low.index @@ -572,9 +572,9 @@ Conversely, to load a corpus iterator from a Matrix Market file: .. code-block:: none - 2021-06-01 10:34:58,756 : INFO : loaded corpus index from /tmp/corpus.mm.index - 2021-06-01 10:34:58,757 : INFO : initializing cython corpus reader from /tmp/corpus.mm - 2021-06-01 10:34:58,757 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries + 2022-04-22 19:16:06,046 : INFO : loaded corpus index from /tmp/corpus.mm.index + 2022-04-22 19:16:06,048 : INFO : initializing cython corpus reader from /tmp/corpus.mm + 2022-04-22 19:16:06,048 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries @@ -685,10 +685,10 @@ To save the same Matrix Market document stream in Blei's LDA-C format, .. code-block:: none - 2021-06-01 10:34:59,085 : INFO : no word id mapping provided; initializing from corpus - 2021-06-01 10:34:59,086 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c - 2021-06-01 10:34:59,087 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab - 2021-06-01 10:34:59,087 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index + 2022-04-22 19:16:06,823 : INFO : no word id mapping provided; initializing from corpus + 2022-04-22 19:16:06,825 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c + 2022-04-22 19:16:06,834 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab + 2022-04-22 19:16:06,835 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index @@ -785,9 +785,9 @@ Optimize converting between corpora and NumPy/SciPy arrays?), see the :ref:`apir .. 
rst-class:: sphx-glr-timing - **Total running time of the script:** ( 0 minutes 3.242 seconds) + **Total running time of the script:** ( 0 minutes 5.212 seconds) -**Estimated memory usage:** 48 MB +**Estimated memory usage:** 47 MB .. _sphx_glr_download_auto_examples_core_run_corpora_and_vector_spaces.py: diff --git a/docs/src/auto_examples/core/sg_execution_times.rst b/docs/src/auto_examples/core/sg_execution_times.rst index da5c34f485..e206b6d636 100644 --- a/docs/src/auto_examples/core/sg_execution_times.rst +++ b/docs/src/auto_examples/core/sg_execution_times.rst @@ -5,10 +5,10 @@ Computation times ================= -**00:03.242** total execution time for **auto_examples_core** files: +**00:05.212** total execution time for **auto_examples_core** files: +--------------------------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` (``run_corpora_and_vector_spaces.py``) | 00:03.242 | 48.2 MB | +| :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` (``run_corpora_and_vector_spaces.py``) | 00:05.212 | 47.2 MB | +--------------------------------------------------------------------------------------------------------------+-----------+---------+ | :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` (``run_core_concepts.py``) | 00:00.000 | 0.0 MB | +--------------------------------------------------------------------------------------------------------------+-----------+---------+ diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index 05643de00c..47819284b2 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -169,14 +169,14 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png - :alt: FastText Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png + :alt: Ensemble LDA - :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` + :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` .. raw:: html @@ -186,18 +186,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_fasttext + /auto_examples/tutorials/run_ensemblelda .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png - :alt: Ensemble LDA + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png + :alt: FastText Model - :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` .. raw:: html @@ -207,7 +207,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_ensemblelda + /auto_examples/tutorials/run_fasttext .. raw:: html @@ -253,14 +253,14 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_scm_thumb.png - :alt: Soft Cosine Measure + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png + :alt: Word Mover's Distance - :ref:`sphx_glr_auto_examples_tutorials_run_scm.py` + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` .. raw:: html @@ -270,18 +270,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_scm + /auto_examples/tutorials/run_wmd .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :alt: Word Mover's Distance + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_scm_thumb.png + :alt: Soft Cosine Measure - :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + :ref:`sphx_glr_auto_examples_tutorials_run_scm.py` .. raw:: html @@ -291,7 +291,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_wmd + /auto_examples/tutorials/run_scm .. raw:: html
diff --git a/docs/src/auto_examples/tutorials/run_lda.ipynb b/docs/src/auto_examples/tutorials/run_lda.ipynb index 363de86b07..b953fe872b 100644 --- a/docs/src/auto_examples/tutorials/run_lda.ipynb +++ b/docs/src/auto_examples/tutorials/run_lda.ipynb @@ -188,7 +188,7 @@ }, "outputs": [], "source": [ - "# Train LDA model.\nfrom gensim.models import LdaModel\n\n# Set training parameters.\nnum_topics = 10\nchunksize = 2000\npasses = 20\niterations = 400\neval_every = None # Don't evaluate model perplexity, takes too much time.\n\n# Make a index to word dictionary.\ntemp = dictionary[0] # This is only to \"load\" the dictionary.\nid2word = dictionary.id2token\n\nmodel = LdaModel(\n corpus=corpus,\n id2word=id2word,\n chunksize=chunksize,\n alpha='auto',\n eta='auto',\n iterations=iterations,\n num_topics=num_topics,\n passes=passes,\n eval_every=eval_every\n)" + "# Train LDA model.\nfrom gensim.models import LdaModel\n\n# Set training parameters.\nnum_topics = 10\nchunksize = 2000\npasses = 20\niterations = 400\neval_every = None # Don't evaluate model perplexity, takes too much time.\n\n# Make an index to word dictionary.\ntemp = dictionary[0] # This is only to \"load\" the dictionary.\nid2word = dictionary.id2token\n\nmodel = LdaModel(\n corpus=corpus,\n id2word=id2word,\n chunksize=chunksize,\n alpha='auto',\n eta='auto',\n iterations=iterations,\n num_topics=num_topics,\n passes=passes,\n eval_every=eval_every\n)" ] }, { @@ -206,7 +206,7 @@ }, "outputs": [], "source": [ - "top_topics = model.top_topics(corpus) #, num_words=20)\n\n# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.\navg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics\nprint('Average topic coherence: %.4f.' % avg_topic_coherence)\n\nfrom pprint import pprint\npprint(top_topics)" + "top_topics = model.top_topics(corpus)\n\n# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.\navg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics\nprint('Average topic coherence: %.4f.' % avg_topic_coherence)\n\nfrom pprint import pprint\npprint(top_topics)" ] }, { @@ -233,7 +233,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.0" + "version": "3.9.7" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/tutorials/run_lda.py b/docs/src/auto_examples/tutorials/run_lda.py index 2ec06a801c..7ee6b07cd2 100644 --- a/docs/src/auto_examples/tutorials/run_lda.py +++ b/docs/src/auto_examples/tutorials/run_lda.py @@ -245,7 +245,7 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' iterations = 400 eval_every = None # Don't evaluate model perplexity, takes too much time. -# Make a index to word dictionary. +# Make an index to word dictionary. temp = dictionary[0] # This is only to "load" the dictionary. id2word = dictionary.id2token @@ -278,7 +278,7 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # methods on the blog at http://rare-technologies.com/lda-training-tips/ ! # -top_topics = model.top_topics(corpus) #, num_words=20) +top_topics = model.top_topics(corpus) # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics. 
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics diff --git a/docs/src/auto_examples/tutorials/run_lda.py.md5 b/docs/src/auto_examples/tutorials/run_lda.py.md5 index 9d25508c2f..6ce0e72960 100644 --- a/docs/src/auto_examples/tutorials/run_lda.py.md5 +++ b/docs/src/auto_examples/tutorials/run_lda.py.md5 @@ -1 +1 @@ -8e115014ecce36aa58a35f11fb525042 \ No newline at end of file +6733157cebb44ef13ae98ec8f4a533f1 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_lda.rst b/docs/src/auto_examples/tutorials/run_lda.rst index 458fbee5c7..28c01ce172 100644 --- a/docs/src/auto_examples/tutorials/run_lda.rst +++ b/docs/src/auto_examples/tutorials/run_lda.rst @@ -204,46 +204,6 @@ don't tend to be useful, and the dataset contains a lot of them. -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - dtype=np.int): - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:35: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:597: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:836: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. 
- Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, positive=False): - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1074: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - max_n_alphas=1000, n_jobs=1, eps=np.finfo(np.float).eps, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1306: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - max_n_alphas=1000, n_jobs=1, eps=np.finfo(np.float).eps, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1442: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, copy_X=True, positive=False): - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/randomized_l1.py:152: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - precompute=False, eps=np.finfo(np.float).eps, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/randomized_l1.py:318: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, random_state=None, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/randomized_l1.py:575: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. 
- Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=4 * np.finfo(np.float).eps, n_jobs=1, - @@ -315,13 +275,11 @@ original data, because we would like to keep the words "machine" and .. code-block:: none - /home/jonaschn/Projects/gensim/gensim/similarities/__init__.py:11: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning. - "The gensim.similarities.levenshtein submodule is disabled, because the optional " - 2021-03-19 14:09:53,817 : INFO : collecting all words and their counts - 2021-03-19 14:09:53,817 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types - 2021-03-19 14:09:59,172 : INFO : collected 1120198 token types (unigram + bigrams) from a corpus of 4629808 words and 1740 sentences - 2021-03-19 14:09:59,172 : INFO : merged Phrases<1120198 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> - 2021-03-19 14:09:59,190 : INFO : Phrases lifecycle event {'msg': 'built Phrases<1120198 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> in 5.36s', 'datetime': '2021-03-19T14:09:59.189253', 'gensim': '4.0.0.rc1', 'python': '3.7.0 (default, Jun 28 2018, 13:15:42) \n[GCC 7.2.0]', 'platform': 'Linux-4.15.0-136-generic-x86_64-with-debian-buster-sid', 'event': 'created'} + 2022-04-22 17:42:29,962 : INFO : collecting all words and their counts + 2022-04-22 17:42:29,963 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types + 2022-04-22 17:42:37,368 : INFO : collected 1120198 token types (unigram + bigrams) from a corpus of 4629808 words and 1740 sentences + 2022-04-22 17:42:37,368 : INFO : merged Phrases<1120198 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> + 2022-04-22 17:42:37,426 : INFO : Phrases lifecycle event {'msg': 'built Phrases<1120198 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> in 7.41s', 'datetime': '2022-04-22T17:42:37.369061', 'gensim': '4.1.3.dev0', 'python': '3.9.7 (default, Sep 3 2021, 12:37:55) \n[Clang 12.0.5 (clang-1205.0.22.9)]', 'platform': 'macOS-11.6.5-x86_64-i386-64bit', 'event': 'created'} @@ -358,12 +316,12 @@ frequency, or maybe combining that with this approach. .. code-block:: none - 2021-03-19 14:10:07,280 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2021-03-19 14:10:09,906 : INFO : built Dictionary(79429 unique tokens: ['1ooooo', '1st', '25oo', '2o00', '4ooo']...) from 1740 documents (total 4953968 corpus positions) - 2021-03-19 14:10:09,906 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(79429 unique tokens: ['1ooooo', '1st', '25oo', '2o00', '4ooo']...) from 1740 documents (total 4953968 corpus positions)", 'datetime': '2021-03-19T14:10:09.906597', 'gensim': '4.0.0.rc1', 'python': '3.7.0 (default, Jun 28 2018, 13:15:42) \n[GCC 7.2.0]', 'platform': 'Linux-4.15.0-136-generic-x86_64-with-debian-buster-sid', 'event': 'created'} - 2021-03-19 14:10:10,101 : INFO : discarding 70785 tokens: [('1ooooo', 1), ('25oo', 2), ('2o00', 6), ('4ooo', 2), ('64k', 6), ('a', 1740), ('aaditional', 1), ('above', 1114), ('abstract', 1740), ('acase', 1)]... - 2021-03-19 14:10:10,102 : INFO : keeping 8644 tokens which were in no less than 20 and no more than 870 (=50.0%) documents - 2021-03-19 14:10:10,128 : INFO : resulting dictionary: Dictionary(8644 unique tokens: ['1st', '5oo', '7th', 'a2', 'a_well']...) 
+ 2022-04-22 17:42:50,414 : INFO : adding document #0 to Dictionary<0 unique tokens: []> + 2022-04-22 17:42:54,959 : INFO : built Dictionary<79429 unique tokens: ['1ooooo', '1st', '25oo', '2o00', '4ooo']...> from 1740 documents (total 4953968 corpus positions) + 2022-04-22 17:42:54,960 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<79429 unique tokens: ['1ooooo', '1st', '25oo', '2o00', '4ooo']...> from 1740 documents (total 4953968 corpus positions)", 'datetime': '2022-04-22T17:42:54.960496', 'gensim': '4.1.3.dev0', 'python': '3.9.7 (default, Sep 3 2021, 12:37:55) \n[Clang 12.0.5 (clang-1205.0.22.9)]', 'platform': 'macOS-11.6.5-x86_64-i386-64bit', 'event': 'created'} + 2022-04-22 17:42:55,733 : INFO : discarding 70785 tokens: [('1ooooo', 1), ('25oo', 2), ('2o00', 6), ('4ooo', 2), ('64k', 6), ('a', 1740), ('aaditional', 1), ('above', 1114), ('abstract', 1740), ('acase', 1)]... + 2022-04-22 17:42:55,734 : INFO : keeping 8644 tokens which were in no less than 20 and no more than 870 (=50.0%) documents + 2022-04-22 17:42:55,779 : INFO : resulting dictionary: Dictionary<8644 unique tokens: ['1st', '5oo', '7th', 'a2', 'a_well']...> @@ -479,7 +437,7 @@ the model that we usually would have to specify explicitly. iterations = 400 eval_every = None # Don't evaluate model perplexity, takes too much time. - # Make a index to word dictionary. + # Make an index to word dictionary. temp = dictionary[0] # This is only to "load" the dictionary. id2word = dictionary.id2token @@ -505,170 +463,170 @@ the model that we usually would have to specify explicitly. .. code-block:: none - 2021-03-19 14:10:12,273 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1] - 2021-03-19 14:10:12,278 : INFO : using serial LDA version on this node - 2021-03-19 14:10:12,478 : INFO : running online (multi-pass) LDA training, 10 topics, 20 passes over the supplied corpus of 1740 documents, updating model once every 1740 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000 - 2021-03-19 14:10:12,482 : INFO : PROGRESS: pass 0, at document #1740/1740 - 2021-03-19 14:10:27,000 : INFO : optimized alpha [0.06386429, 0.07352975, 0.10417274, 0.09618805, 0.09326739, 0.07658379, 0.05232423, 0.09257348, 0.05156824, 0.064680815] - 2021-03-19 14:10:27,050 : INFO : topic #8 (0.052): 0.004*"layer" + 0.004*"action" + 0.003*"generalization" + 0.003*"image" + 0.002*"dynamic" + 0.002*"sample" + 0.002*"optimal" + 0.002*"matrix" + 0.002*"net" + 0.002*"classifier" - 2021-03-19 14:10:27,051 : INFO : topic #6 (0.052): 0.006*"image" + 0.005*"hidden" + 0.004*"recognition" + 0.003*"component" + 0.003*"field" + 0.003*"dynamic" + 0.002*"map" + 0.002*"solution" + 0.002*"net" + 0.002*"generalization" - 2021-03-19 14:10:27,051 : INFO : topic #4 (0.093): 0.004*"class" + 0.003*"rule" + 0.003*"hidden" + 0.003*"neuron" + 0.003*"layer" + 0.003*"field" + 0.002*"noise" + 0.002*"net" + 0.002*"image" + 0.002*"node" - 2021-03-19 14:10:27,051 : INFO : topic #3 (0.096): 0.006*"image" + 0.003*"gaussian" + 0.003*"layer" + 0.003*"neuron" + 0.003*"field" + 0.003*"matrix" + 0.003*"circuit" + 0.003*"class" + 0.002*"threshold" + 0.002*"recognition" - 2021-03-19 14:10:27,051 : INFO : topic #2 (0.104): 0.005*"neuron" + 0.004*"image" + 0.004*"control" + 0.004*"layer" + 0.004*"hidden" + 0.003*"recognition" + 0.003*"object" + 0.003*"signal" + 0.003*"response" + 0.003*"class" - 2021-03-19 14:10:27,051 : INFO : topic diff=1.190941, rho=1.000000 - 2021-03-19 14:10:27,063 : INFO 
: PROGRESS: pass 1, at document #1740/1740 - 2021-03-19 14:10:36,200 : INFO : optimized alpha [0.05691391, 0.05848132, 0.0764488, 0.07592632, 0.07411411, 0.06465285, 0.046124753, 0.06826302, 0.043833494, 0.05291034] - 2021-03-19 14:10:36,207 : INFO : topic #8 (0.044): 0.007*"action" + 0.004*"robot" + 0.004*"control" + 0.003*"optimal" + 0.003*"policy" + 0.003*"reinforcement" + 0.003*"generalization" + 0.003*"dynamic" + 0.003*"layer" + 0.003*"trajectory" - 2021-03-19 14:10:36,207 : INFO : topic #6 (0.046): 0.007*"image" + 0.007*"hidden" + 0.005*"recognition" + 0.003*"hidden_unit" + 0.003*"energy" + 0.003*"component" + 0.003*"map" + 0.003*"generalization" + 0.003*"net" + 0.003*"layer" - 2021-03-19 14:10:36,207 : INFO : topic #4 (0.074): 0.005*"class" + 0.004*"rule" + 0.003*"hidden" + 0.003*"layer" + 0.003*"net" + 0.003*"classifier" + 0.002*"node" + 0.002*"word" + 0.002*"context" + 0.002*"architecture" - 2021-03-19 14:10:36,207 : INFO : topic #3 (0.076): 0.007*"image" + 0.004*"circuit" + 0.003*"layer" + 0.003*"field" + 0.003*"analog" + 0.003*"chip" + 0.003*"threshold" + 0.003*"gaussian" + 0.003*"class" + 0.003*"matrix" - 2021-03-19 14:10:36,208 : INFO : topic #2 (0.076): 0.005*"control" + 0.005*"recognition" + 0.005*"image" + 0.005*"object" + 0.004*"speech" + 0.004*"layer" + 0.004*"signal" + 0.004*"neuron" + 0.004*"hidden" + 0.003*"word" - 2021-03-19 14:10:36,208 : INFO : topic diff=0.297702, rho=0.577350 - 2021-03-19 14:10:36,218 : INFO : PROGRESS: pass 2, at document #1740/1740 - 2021-03-19 14:10:43,026 : INFO : optimized alpha [0.05407287, 0.051192053, 0.06480061, 0.06461501, 0.06359977, 0.05890888, 0.042885136, 0.056735355, 0.039943077, 0.04743726] - 2021-03-19 14:10:43,033 : INFO : topic #8 (0.040): 0.008*"action" + 0.006*"control" + 0.005*"robot" + 0.005*"reinforcement" + 0.005*"policy" + 0.004*"optimal" + 0.004*"dynamic" + 0.003*"trajectory" + 0.003*"reinforcement_learning" + 0.003*"controller" - 2021-03-19 14:10:43,033 : INFO : topic #6 (0.043): 0.008*"image" + 0.008*"hidden" + 0.005*"recognition" + 0.004*"hidden_unit" + 0.003*"energy" + 0.003*"layer" + 0.003*"net" + 0.003*"generalization" + 0.003*"map" + 0.003*"solution" - 2021-03-19 14:10:43,034 : INFO : topic #4 (0.064): 0.005*"class" + 0.004*"rule" + 0.004*"hidden" + 0.004*"layer" + 0.003*"net" + 0.003*"classifier" + 0.003*"node" + 0.003*"word" + 0.003*"context" + 0.002*"architecture" - 2021-03-19 14:10:43,034 : INFO : topic #3 (0.065): 0.008*"image" + 0.004*"circuit" + 0.004*"chip" + 0.004*"analog" + 0.004*"threshold" + 0.004*"layer" + 0.003*"field" + 0.003*"node" + 0.003*"class" + 0.003*"net" - 2021-03-19 14:10:43,034 : INFO : topic #2 (0.065): 0.006*"recognition" + 0.006*"speech" + 0.005*"control" + 0.005*"object" + 0.005*"image" + 0.005*"layer" + 0.005*"signal" + 0.004*"word" + 0.004*"hidden" + 0.003*"classification" - 2021-03-19 14:10:43,034 : INFO : topic diff=0.256329, rho=0.500000 - 2021-03-19 14:10:43,044 : INFO : PROGRESS: pass 3, at document #1740/1740 - 2021-03-19 14:10:48,846 : INFO : optimized alpha [0.053115886, 0.046841364, 0.05838778, 0.05814584, 0.05758646, 0.05547897, 0.040862918, 0.05055692, 0.037515096, 0.044183854] - 2021-03-19 14:10:48,853 : INFO : topic #8 (0.038): 0.010*"action" + 0.008*"control" + 0.006*"reinforcement" + 0.006*"robot" + 0.005*"policy" + 0.005*"optimal" + 0.004*"controller" + 0.004*"dynamic" + 0.004*"reinforcement_learning" + 0.004*"trajectory" - 2021-03-19 14:10:48,853 : INFO : topic #6 (0.041): 0.009*"hidden" + 0.008*"image" + 0.006*"recognition" + 0.004*"hidden_unit" + 0.004*"layer" + 
0.004*"energy" + 0.003*"net" + 0.003*"generalization" + 0.003*"field" + 0.003*"map" - 2021-03-19 14:10:48,853 : INFO : topic #4 (0.058): 0.005*"class" + 0.005*"hidden" + 0.004*"rule" + 0.004*"layer" + 0.004*"net" + 0.004*"classifier" + 0.003*"node" + 0.003*"propagation" + 0.003*"architecture" + 0.003*"context" - 2021-03-19 14:10:48,854 : INFO : topic #3 (0.058): 0.009*"image" + 0.005*"chip" + 0.005*"circuit" + 0.005*"analog" + 0.004*"threshold" + 0.004*"layer" + 0.003*"field" + 0.003*"bit" + 0.003*"node" + 0.003*"net" - 2021-03-19 14:10:48,854 : INFO : topic #2 (0.058): 0.007*"recognition" + 0.007*"speech" + 0.006*"object" + 0.006*"image" + 0.005*"word" + 0.005*"layer" + 0.005*"control" + 0.005*"signal" + 0.004*"hidden" + 0.003*"face" - 2021-03-19 14:10:48,854 : INFO : topic diff=0.230126, rho=0.447214 - 2021-03-19 14:10:48,864 : INFO : PROGRESS: pass 4, at document #1740/1740 - 2021-03-19 14:10:54,097 : INFO : optimized alpha [0.052869715, 0.044183813, 0.0546517, 0.054109406, 0.053801704, 0.053375203, 0.0394719, 0.04672288, 0.035995413, 0.04192354] - 2021-03-19 14:10:54,105 : INFO : topic #8 (0.036): 0.010*"action" + 0.010*"control" + 0.007*"reinforcement" + 0.006*"robot" + 0.006*"policy" + 0.005*"optimal" + 0.005*"controller" + 0.005*"dynamic" + 0.004*"reinforcement_learning" + 0.004*"trajectory" - 2021-03-19 14:10:54,105 : INFO : topic #6 (0.039): 0.009*"hidden" + 0.008*"image" + 0.006*"recognition" + 0.005*"hidden_unit" + 0.004*"layer" + 0.004*"energy" + 0.003*"net" + 0.003*"digit" + 0.003*"field" + 0.003*"generalization" - 2021-03-19 14:10:54,105 : INFO : topic #4 (0.054): 0.005*"class" + 0.005*"hidden" + 0.005*"rule" + 0.005*"net" + 0.005*"layer" + 0.004*"classifier" + 0.004*"node" + 0.003*"propagation" + 0.003*"architecture" + 0.003*"sequence" - 2021-03-19 14:10:54,106 : INFO : topic #3 (0.054): 0.009*"image" + 0.006*"chip" + 0.006*"circuit" + 0.006*"analog" + 0.004*"threshold" + 0.004*"layer" + 0.003*"field" + 0.003*"bit" + 0.003*"node" + 0.003*"net" - 2021-03-19 14:10:54,106 : INFO : topic #2 (0.055): 0.008*"recognition" + 0.008*"speech" + 0.007*"object" + 0.006*"word" + 0.006*"image" + 0.005*"layer" + 0.005*"signal" + 0.005*"control" + 0.004*"hidden" + 0.004*"face" - 2021-03-19 14:10:54,106 : INFO : topic diff=0.214075, rho=0.408248 - 2021-03-19 14:10:54,116 : INFO : PROGRESS: pass 5, at document #1740/1740 - 2021-03-19 14:10:59,195 : INFO : optimized alpha [0.05290075, 0.042460088, 0.052235015, 0.051339325, 0.05138389, 0.05190376, 0.038578223, 0.044312876, 0.035001513, 0.040355477] - 2021-03-19 14:10:59,202 : INFO : topic #8 (0.035): 0.011*"control" + 0.011*"action" + 0.007*"reinforcement" + 0.006*"policy" + 0.006*"robot" + 0.005*"controller" + 0.005*"optimal" + 0.005*"dynamic" + 0.005*"reinforcement_learning" + 0.005*"trajectory" - 2021-03-19 14:10:59,202 : INFO : topic #6 (0.039): 0.010*"hidden" + 0.008*"image" + 0.006*"recognition" + 0.005*"hidden_unit" + 0.005*"layer" + 0.004*"energy" + 0.004*"digit" + 0.004*"character" + 0.004*"net" + 0.003*"field" - 2021-03-19 14:10:59,203 : INFO : topic #5 (0.052): 0.021*"neuron" + 0.012*"cell" + 0.007*"response" + 0.007*"spike" + 0.006*"synaptic" + 0.006*"stimulus" + 0.005*"activity" + 0.005*"firing" + 0.005*"signal" + 0.004*"memory" - 2021-03-19 14:10:59,203 : INFO : topic #2 (0.052): 0.009*"recognition" + 0.008*"speech" + 0.007*"object" + 0.007*"word" + 0.006*"image" + 0.006*"signal" + 0.005*"layer" + 0.004*"hidden" + 0.004*"control" + 0.004*"face" - 2021-03-19 14:10:59,203 : INFO : topic #0 (0.053): 0.005*"gaussian" + 0.005*"noise" + 
0.005*"matrix" + 0.005*"hidden" + 0.004*"approximation" + 0.004*"sample" + 0.004*"estimate" + 0.004*"variance" + 0.004*"bayesian" + 0.003*"prior" - 2021-03-19 14:10:59,203 : INFO : topic diff=0.202368, rho=0.377964 - 2021-03-19 14:10:59,214 : INFO : PROGRESS: pass 6, at document #1740/1740 - 2021-03-19 14:11:04,013 : INFO : optimized alpha [0.053310633, 0.041254587, 0.050613035, 0.04936813, 0.049790192, 0.05083673, 0.038025398, 0.042830754, 0.034370847, 0.039269455] - 2021-03-19 14:11:04,020 : INFO : topic #8 (0.034): 0.012*"control" + 0.011*"action" + 0.008*"reinforcement" + 0.007*"policy" + 0.006*"robot" + 0.006*"controller" + 0.005*"optimal" + 0.005*"dynamic" + 0.005*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:04,020 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"recognition" + 0.005*"hidden_unit" + 0.005*"layer" + 0.004*"energy" + 0.004*"character" + 0.004*"digit" + 0.004*"net" + 0.004*"field" - 2021-03-19 14:11:04,021 : INFO : topic #2 (0.051): 0.010*"recognition" + 0.009*"speech" + 0.007*"word" + 0.007*"object" + 0.007*"image" + 0.006*"signal" + 0.006*"layer" + 0.004*"hidden" + 0.004*"face" + 0.004*"classification" - 2021-03-19 14:11:04,021 : INFO : topic #5 (0.051): 0.021*"neuron" + 0.012*"cell" + 0.007*"response" + 0.007*"spike" + 0.006*"synaptic" + 0.006*"stimulus" + 0.006*"activity" + 0.005*"firing" + 0.005*"signal" + 0.004*"frequency" - 2021-03-19 14:11:04,021 : INFO : topic #0 (0.053): 0.006*"gaussian" + 0.005*"noise" + 0.005*"matrix" + 0.005*"hidden" + 0.004*"approximation" + 0.004*"estimate" + 0.004*"sample" + 0.004*"bayesian" + 0.004*"variance" + 0.004*"prior" - 2021-03-19 14:11:04,021 : INFO : topic diff=0.192693, rho=0.353553 - 2021-03-19 14:11:04,032 : INFO : PROGRESS: pass 7, at document #1740/1740 - 2021-03-19 14:11:08,718 : INFO : optimized alpha [0.053891532, 0.040544394, 0.049499568, 0.047873296, 0.04881682, 0.0500006, 0.037689965, 0.04181969, 0.03393164, 0.038607482] - 2021-03-19 14:11:08,725 : INFO : topic #8 (0.034): 0.013*"control" + 0.012*"action" + 0.008*"reinforcement" + 0.007*"policy" + 0.006*"robot" + 0.006*"controller" + 0.005*"dynamic" + 0.005*"optimal" + 0.005*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:08,725 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"recognition" + 0.005*"layer" + 0.005*"hidden_unit" + 0.005*"character" + 0.004*"energy" + 0.004*"digit" + 0.004*"net" + 0.004*"field" - 2021-03-19 14:11:08,726 : INFO : topic #2 (0.049): 0.011*"recognition" + 0.009*"speech" + 0.008*"word" + 0.007*"object" + 0.007*"image" + 0.006*"signal" + 0.006*"layer" + 0.004*"face" + 0.004*"hidden" + 0.004*"classification" - 2021-03-19 14:11:08,726 : INFO : topic #5 (0.050): 0.022*"neuron" + 0.012*"cell" + 0.007*"response" + 0.007*"spike" + 0.007*"synaptic" + 0.006*"stimulus" + 0.006*"activity" + 0.005*"firing" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:08,726 : INFO : topic #0 (0.054): 0.006*"gaussian" + 0.005*"noise" + 0.005*"matrix" + 0.004*"approximation" + 0.004*"hidden" + 0.004*"estimate" + 0.004*"sample" + 0.004*"bayesian" + 0.004*"variance" + 0.004*"prior" - 2021-03-19 14:11:08,726 : INFO : topic diff=0.183651, rho=0.333333 - 2021-03-19 14:11:08,737 : INFO : PROGRESS: pass 8, at document #1740/1740 - 2021-03-19 14:11:13,510 : INFO : optimized alpha [0.0545965, 0.040113404, 0.048812777, 0.0467447, 0.048271947, 0.049433745, 0.03755086, 0.04124074, 0.033623673, 0.038269136] - 2021-03-19 14:11:13,518 : INFO : topic #8 (0.034): 0.014*"control" + 0.012*"action" + 
0.008*"reinforcement" + 0.007*"policy" + 0.006*"controller" + 0.006*"robot" + 0.006*"dynamic" + 0.006*"optimal" + 0.005*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:13,518 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"recognition" + 0.005*"layer" + 0.005*"hidden_unit" + 0.005*"character" + 0.004*"energy" + 0.004*"digit" + 0.004*"net" + 0.004*"field" - 2021-03-19 14:11:13,518 : INFO : topic #2 (0.049): 0.011*"recognition" + 0.009*"speech" + 0.008*"word" + 0.008*"object" + 0.007*"image" + 0.006*"signal" + 0.006*"layer" + 0.004*"face" + 0.004*"classification" + 0.004*"hidden" - 2021-03-19 14:11:13,518 : INFO : topic #5 (0.049): 0.022*"neuron" + 0.013*"cell" + 0.008*"response" + 0.007*"spike" + 0.007*"synaptic" + 0.006*"stimulus" + 0.006*"activity" + 0.006*"firing" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:13,519 : INFO : topic #0 (0.055): 0.006*"gaussian" + 0.005*"noise" + 0.005*"matrix" + 0.004*"approximation" + 0.004*"estimate" + 0.004*"hidden" + 0.004*"sample" + 0.004*"bayesian" + 0.004*"likelihood" + 0.004*"variance" - 2021-03-19 14:11:13,519 : INFO : topic diff=0.175043, rho=0.316228 - 2021-03-19 14:11:13,530 : INFO : PROGRESS: pass 9, at document #1740/1740 - 2021-03-19 14:11:18,487 : INFO : optimized alpha [0.055368014, 0.039957594, 0.048399936, 0.045934383, 0.04802085, 0.049097233, 0.037513737, 0.040929828, 0.0334422, 0.038141657] - 2021-03-19 14:11:18,495 : INFO : topic #8 (0.033): 0.014*"control" + 0.012*"action" + 0.008*"reinforcement" + 0.007*"policy" + 0.006*"controller" + 0.006*"robot" + 0.006*"dynamic" + 0.006*"optimal" + 0.005*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:18,495 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"layer" + 0.006*"recognition" + 0.005*"character" + 0.005*"hidden_unit" + 0.004*"digit" + 0.004*"energy" + 0.004*"field" + 0.004*"net" - 2021-03-19 14:11:18,496 : INFO : topic #2 (0.048): 0.012*"recognition" + 0.010*"speech" + 0.009*"word" + 0.008*"image" + 0.008*"object" + 0.006*"signal" + 0.006*"layer" + 0.004*"face" + 0.004*"classification" + 0.004*"trained" - 2021-03-19 14:11:18,496 : INFO : topic #5 (0.049): 0.022*"neuron" + 0.013*"cell" + 0.008*"spike" + 0.008*"response" + 0.007*"synaptic" + 0.006*"stimulus" + 0.006*"activity" + 0.006*"firing" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:18,496 : INFO : topic #0 (0.055): 0.006*"gaussian" + 0.005*"noise" + 0.005*"matrix" + 0.005*"estimate" + 0.005*"approximation" + 0.004*"hidden" + 0.004*"sample" + 0.004*"bayesian" + 0.004*"likelihood" + 0.004*"prior" - 2021-03-19 14:11:18,496 : INFO : topic diff=0.166410, rho=0.301511 - 2021-03-19 14:11:18,507 : INFO : PROGRESS: pass 10, at document #1740/1740 - 2021-03-19 14:11:23,641 : INFO : optimized alpha [0.056234606, 0.039904997, 0.04814231, 0.045396697, 0.048054837, 0.048870783, 0.037563145, 0.04080154, 0.03336996, 0.03815883] - 2021-03-19 14:11:23,650 : INFO : topic #8 (0.033): 0.015*"control" + 0.012*"action" + 0.008*"reinforcement" + 0.008*"policy" + 0.007*"controller" + 0.006*"robot" + 0.006*"dynamic" + 0.006*"optimal" + 0.005*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:23,651 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"layer" + 0.006*"character" + 0.005*"recognition" + 0.005*"hidden_unit" + 0.004*"digit" + 0.004*"energy" + 0.004*"field" + 0.004*"net" - 2021-03-19 14:11:23,651 : INFO : topic #2 (0.048): 0.012*"recognition" + 0.010*"speech" + 0.009*"word" + 0.008*"image" + 0.008*"object" + 0.006*"signal" + 
0.006*"layer" + 0.005*"face" + 0.004*"classification" + 0.004*"trained" - 2021-03-19 14:11:23,651 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.013*"cell" + 0.008*"spike" + 0.008*"response" + 0.007*"synaptic" + 0.006*"activity" + 0.006*"stimulus" + 0.006*"firing" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:23,651 : INFO : topic #0 (0.056): 0.006*"gaussian" + 0.005*"noise" + 0.005*"estimate" + 0.005*"matrix" + 0.005*"approximation" + 0.004*"bayesian" + 0.004*"likelihood" + 0.004*"sample" + 0.004*"hidden" + 0.004*"prior" - 2021-03-19 14:11:23,651 : INFO : topic diff=0.157726, rho=0.288675 - 2021-03-19 14:11:23,663 : INFO : PROGRESS: pass 11, at document #1740/1740 - 2021-03-19 14:11:28,247 : INFO : optimized alpha [0.05706192, 0.039978355, 0.04797657, 0.044978894, 0.048209604, 0.048704833, 0.03767563, 0.04074631, 0.033347335, 0.038310345] - 2021-03-19 14:11:28,255 : INFO : topic #8 (0.033): 0.015*"control" + 0.012*"action" + 0.008*"reinforcement" + 0.008*"policy" + 0.007*"controller" + 0.006*"robot" + 0.006*"dynamic" + 0.006*"optimal" + 0.006*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:28,256 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"layer" + 0.006*"character" + 0.005*"recognition" + 0.005*"hidden_unit" + 0.004*"digit" + 0.004*"energy" + 0.004*"field" + 0.004*"net" - 2021-03-19 14:11:28,256 : INFO : topic #4 (0.048): 0.008*"hidden" + 0.007*"net" + 0.006*"layer" + 0.006*"rule" + 0.005*"node" + 0.004*"classifier" + 0.004*"hidden_unit" + 0.004*"class" + 0.004*"propagation" + 0.004*"sequence" - 2021-03-19 14:11:28,256 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.013*"cell" + 0.008*"spike" + 0.008*"response" + 0.007*"synaptic" + 0.006*"activity" + 0.006*"firing" + 0.006*"stimulus" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:28,256 : INFO : topic #0 (0.057): 0.006*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"matrix" + 0.005*"approximation" + 0.004*"likelihood" + 0.004*"bayesian" + 0.004*"prior" + 0.004*"sample" + 0.004*"hidden" - 2021-03-19 14:11:28,256 : INFO : topic diff=0.149091, rho=0.277350 - 2021-03-19 14:11:28,268 : INFO : PROGRESS: pass 12, at document #1740/1740 - 2021-03-19 14:11:32,844 : INFO : optimized alpha [0.057841934, 0.040147286, 0.047984846, 0.04466845, 0.048510514, 0.048608452, 0.037831437, 0.04078982, 0.03338453, 0.038538743] - 2021-03-19 14:11:32,852 : INFO : topic #8 (0.033): 0.015*"control" + 0.012*"action" + 0.008*"reinforcement" + 0.008*"policy" + 0.007*"controller" + 0.006*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:32,852 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"layer" + 0.006*"character" + 0.005*"recognition" + 0.005*"hidden_unit" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"field" - 2021-03-19 14:11:32,853 : INFO : topic #4 (0.049): 0.008*"hidden" + 0.007*"net" + 0.006*"layer" + 0.006*"rule" + 0.005*"node" + 0.004*"hidden_unit" + 0.004*"classifier" + 0.004*"class" + 0.004*"propagation" + 0.004*"sequence" - 2021-03-19 14:11:32,853 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.013*"cell" + 0.008*"spike" + 0.008*"response" + 0.007*"synaptic" + 0.006*"firing" + 0.006*"activity" + 0.006*"stimulus" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:32,853 : INFO : topic #0 (0.058): 0.006*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"approximation" + 0.005*"matrix" + 0.004*"likelihood" + 0.004*"bayesian" + 0.004*"prior" + 0.004*"variance" + 0.004*"sample" - 2021-03-19 
14:11:32,853 : INFO : topic diff=0.140596, rho=0.267261 - 2021-03-19 14:11:32,865 : INFO : PROGRESS: pass 13, at document #1740/1740 - 2021-03-19 14:11:37,447 : INFO : optimized alpha [0.058551796, 0.040399875, 0.048106886, 0.044424307, 0.04896659, 0.04858641, 0.03804483, 0.040931225, 0.03344661, 0.038809597] - 2021-03-19 14:11:37,455 : INFO : topic #8 (0.033): 0.016*"control" + 0.013*"action" + 0.008*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.006*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:37,455 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"layer" + 0.006*"character" + 0.005*"hidden_unit" + 0.005*"recognition" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"field" - 2021-03-19 14:11:37,456 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.013*"cell" + 0.008*"spike" + 0.008*"response" + 0.007*"synaptic" + 0.006*"firing" + 0.006*"activity" + 0.006*"stimulus" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:37,456 : INFO : topic #4 (0.049): 0.008*"hidden" + 0.007*"net" + 0.006*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 0.004*"classifier" + 0.004*"class" + 0.004*"sequence" + 0.004*"propagation" - 2021-03-19 14:11:37,456 : INFO : topic #0 (0.059): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"approximation" + 0.005*"matrix" + 0.005*"likelihood" + 0.004*"bayesian" + 0.004*"prior" + 0.004*"variance" + 0.004*"sample" - 2021-03-19 14:11:37,456 : INFO : topic diff=0.132327, rho=0.258199 - 2021-03-19 14:11:37,467 : INFO : PROGRESS: pass 14, at document #1740/1740 - 2021-03-19 14:11:41,536 : INFO : optimized alpha [0.05925279, 0.040705983, 0.04832607, 0.04427085, 0.049501013, 0.048644915, 0.038285527, 0.04113948, 0.03352695, 0.039150245] - 2021-03-19 14:11:41,544 : INFO : topic #8 (0.034): 0.016*"control" + 0.013*"action" + 0.009*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.006*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:41,544 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"character" + 0.006*"layer" + 0.005*"hidden_unit" + 0.005*"recognition" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"net" - 2021-03-19 14:11:41,544 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.013*"cell" + 0.008*"spike" + 0.008*"response" + 0.007*"synaptic" + 0.006*"firing" + 0.006*"activity" + 0.006*"stimulus" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:41,545 : INFO : topic #4 (0.050): 0.008*"hidden" + 0.008*"net" + 0.006*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 0.004*"sequence" + 0.004*"propagation" + 0.004*"architecture" + 0.004*"activation" - 2021-03-19 14:11:41,545 : INFO : topic #0 (0.059): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"approximation" + 0.005*"likelihood" + 0.005*"matrix" + 0.004*"prior" + 0.004*"bayesian" + 0.004*"variance" + 0.004*"density" - 2021-03-19 14:11:41,545 : INFO : topic diff=0.124371, rho=0.250000 - 2021-03-19 14:11:41,556 : INFO : PROGRESS: pass 15, at document #1740/1740 - 2021-03-19 14:11:45,592 : INFO : optimized alpha [0.05994643, 0.041028578, 0.048593685, 0.04419364, 0.05009154, 0.048734292, 0.03856185, 0.041424613, 0.033627965, 0.039535556] - 2021-03-19 14:11:45,600 : INFO : topic #8 (0.034): 0.016*"control" + 0.013*"action" + 0.009*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.007*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 
0.005*"reinforcement_learning" - 2021-03-19 14:11:45,600 : INFO : topic #6 (0.039): 0.010*"hidden" + 0.009*"image" + 0.006*"character" + 0.006*"layer" + 0.005*"hidden_unit" + 0.005*"recognition" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"dynamic" - 2021-03-19 14:11:45,600 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.014*"cell" + 0.008*"spike" + 0.008*"response" + 0.008*"synaptic" + 0.006*"firing" + 0.006*"activity" + 0.006*"stimulus" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:45,600 : INFO : topic #4 (0.050): 0.008*"hidden" + 0.008*"net" + 0.007*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 0.004*"sequence" + 0.004*"architecture" + 0.004*"propagation" + 0.004*"activation" - 2021-03-19 14:11:45,601 : INFO : topic #0 (0.060): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"likelihood" + 0.005*"approximation" + 0.005*"matrix" + 0.004*"prior" + 0.004*"bayesian" + 0.004*"variance" + 0.004*"density" - 2021-03-19 14:11:45,601 : INFO : topic diff=0.116794, rho=0.242536 - 2021-03-19 14:11:45,611 : INFO : PROGRESS: pass 16, at document #1740/1740 - 2021-03-19 14:11:49,737 : INFO : optimized alpha [0.06068379, 0.041378528, 0.048856508, 0.0441432, 0.05072476, 0.0488511, 0.038870405, 0.041741073, 0.03375229, 0.039979585] - 2021-03-19 14:11:49,745 : INFO : topic #8 (0.034): 0.016*"control" + 0.013*"action" + 0.009*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.007*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.006*"reinforcement_learning" - 2021-03-19 14:11:49,745 : INFO : topic #6 (0.039): 0.010*"hidden" + 0.009*"image" + 0.006*"character" + 0.006*"layer" + 0.005*"hidden_unit" + 0.005*"recognition" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"dynamic" - 2021-03-19 14:11:49,745 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.014*"cell" + 0.008*"spike" + 0.008*"response" + 0.008*"synaptic" + 0.006*"firing" + 0.006*"activity" + 0.006*"stimulus" + 0.006*"signal" + 0.005*"frequency" - 2021-03-19 14:11:49,746 : INFO : topic #4 (0.051): 0.008*"hidden" + 0.008*"net" + 0.007*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 0.004*"sequence" + 0.004*"architecture" + 0.004*"activation" + 0.004*"propagation" - 2021-03-19 14:11:49,746 : INFO : topic #0 (0.061): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"likelihood" + 0.005*"approximation" + 0.005*"prior" + 0.004*"bayesian" + 0.004*"matrix" + 0.004*"variance" + 0.004*"density" - 2021-03-19 14:11:49,746 : INFO : topic diff=0.109661, rho=0.235702 - 2021-03-19 14:11:49,756 : INFO : PROGRESS: pass 17, at document #1740/1740 - 2021-03-19 14:11:53,841 : INFO : optimized alpha [0.061406724, 0.04174132, 0.0491224, 0.044116188, 0.05141323, 0.049025778, 0.03920408, 0.04207979, 0.033907466, 0.04045379] - 2021-03-19 14:11:53,850 : INFO : topic #8 (0.034): 0.016*"control" + 0.013*"action" + 0.009*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.007*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.006*"reinforcement_learning" - 2021-03-19 14:11:53,850 : INFO : topic #6 (0.039): 0.010*"hidden" + 0.009*"image" + 0.007*"character" + 0.006*"layer" + 0.005*"hidden_unit" + 0.005*"recognition" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"dynamic" - 2021-03-19 14:11:53,850 : INFO : topic #2 (0.049): 0.014*"recognition" + 0.011*"speech" + 0.010*"word" + 0.010*"image" + 0.008*"object" + 0.006*"signal" + 0.005*"layer" + 0.005*"face" + 0.005*"classification" + 0.005*"trained" - 2021-03-19 14:11:53,851 : INFO 
: topic #4 (0.051): 0.009*"hidden" + 0.008*"net" + 0.007*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 0.004*"architecture" + 0.004*"sequence" + 0.004*"activation" + 0.004*"propagation" - 2021-03-19 14:11:53,851 : INFO : topic #0 (0.061): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"likelihood" + 0.005*"approximation" + 0.005*"prior" + 0.005*"bayesian" + 0.004*"matrix" + 0.004*"variance" + 0.004*"density" - 2021-03-19 14:11:53,851 : INFO : topic diff=0.102938, rho=0.229416 - 2021-03-19 14:11:53,862 : INFO : PROGRESS: pass 18, at document #1740/1740 - 2021-03-19 14:11:57,816 : INFO : optimized alpha [0.062154472, 0.042110436, 0.04939213, 0.044109803, 0.05212181, 0.049227104, 0.039544087, 0.04246847, 0.03410476, 0.040957462] - 2021-03-19 14:11:57,823 : INFO : topic #8 (0.034): 0.016*"control" + 0.013*"action" + 0.009*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.007*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.006*"reinforcement_learning" - 2021-03-19 14:11:57,824 : INFO : topic #6 (0.040): 0.010*"hidden" + 0.008*"image" + 0.007*"character" + 0.006*"layer" + 0.005*"hidden_unit" + 0.005*"recognition" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"dynamic" - 2021-03-19 14:11:57,824 : INFO : topic #2 (0.049): 0.014*"recognition" + 0.011*"speech" + 0.010*"word" + 0.010*"image" + 0.008*"object" + 0.006*"signal" + 0.005*"layer" + 0.005*"face" + 0.005*"classification" + 0.005*"trained" - 2021-03-19 14:11:57,824 : INFO : topic #4 (0.052): 0.009*"hidden" + 0.008*"net" + 0.007*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 0.004*"architecture" + 0.004*"sequence" + 0.004*"activation" + 0.004*"propagation" - 2021-03-19 14:11:57,824 : INFO : topic #0 (0.062): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"likelihood" + 0.005*"approximation" + 0.005*"prior" + 0.005*"bayesian" + 0.004*"matrix" + 0.004*"density" + 0.004*"variance" - 2021-03-19 14:11:57,825 : INFO : topic diff=0.096678, rho=0.223607 - 2021-03-19 14:11:57,835 : INFO : PROGRESS: pass 19, at document #1740/1740 - 2021-03-19 14:12:01,856 : INFO : optimized alpha [0.06292996, 0.04251684, 0.049703237, 0.044167582, 0.052860808, 0.049467582, 0.039925203, 0.042864826, 0.03433462, 0.0415304] - 2021-03-19 14:12:01,864 : INFO : topic #8 (0.034): 0.016*"control" + 0.013*"action" + 0.009*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.007*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.006*"reinforcement_learning" - 2021-03-19 14:12:01,864 : INFO : topic #6 (0.040): 0.010*"hidden" + 0.008*"image" + 0.007*"character" + 0.006*"layer" + 0.005*"hidden_unit" + 0.005*"recognition" + 0.005*"digit" + 0.004*"attractor" + 0.004*"energy" + 0.004*"dynamic" - 2021-03-19 14:12:01,864 : INFO : topic #2 (0.050): 0.014*"recognition" + 0.011*"speech" + 0.010*"word" + 0.010*"image" + 0.008*"object" + 0.006*"signal" + 0.005*"layer" + 0.005*"classification" + 0.005*"face" + 0.005*"trained" - 2021-03-19 14:12:01,865 : INFO : topic #4 (0.053): 0.009*"hidden" + 0.008*"net" + 0.007*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 0.004*"architecture" + 0.004*"activation" + 0.004*"sequence" + 0.004*"propagation" - 2021-03-19 14:12:01,865 : INFO : topic #0 (0.063): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"likelihood" + 0.005*"approximation" + 0.005*"prior" + 0.005*"bayesian" + 0.004*"density" + 0.004*"mixture" + 0.004*"variance" - 2021-03-19 14:12:01,865 : INFO : topic diff=0.090853, rho=0.218218 - 2021-03-19 
14:12:01,877 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel(num_terms=8644, num_topics=10, decay=0.5, chunksize=2000) in 109.40s', 'datetime': '2021-03-19T14:12:01.877604', 'gensim': '4.0.0.rc1', 'python': '3.7.0 (default, Jun 28 2018, 13:15:42) \n[GCC 7.2.0]', 'platform': 'Linux-4.15.0-136-generic-x86_64-with-debian-buster-sid', 'event': 'created'}
+    2022-04-22 17:43:05,111 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
+    2022-04-22 17:43:05,115 : INFO : using serial LDA version on this node
+    2022-04-22 17:43:05,137 : INFO : running online (multi-pass) LDA training, 10 topics, 20 passes over the supplied corpus of 1740 documents, updating model once every 1740 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
+    2022-04-22 17:43:05,148 : INFO : PROGRESS: pass 0, at document #1740/1740
+    2022-04-22 17:43:21,190 : INFO : optimized alpha [0.0578294, 0.07125457, 0.07889137, 0.09016259, 0.077791244, 0.0792375, 0.097086295, 0.061600033, 0.095310934, 0.060617708]
+    2022-04-22 17:43:21,202 : INFO : topic #0 (0.058): 0.007*"hidden" + 0.006*"word" + 0.005*"recognition" + 0.004*"gaussian" + 0.003*"hidden_unit" + 0.003*"rule" + 0.003*"component" + 0.003*"layer" + 0.003*"image" + 0.002*"connection"
+    2022-04-22 17:43:21,202 : INFO : topic #9 (0.061): 0.015*"neuron" + 0.007*"cell" + 0.005*"signal" + 0.005*"spike" + 0.004*"layer" + 0.004*"response" + 0.004*"firing" + 0.004*"noise" + 0.003*"density" + 0.003*"hidden"
+    2022-04-22 17:43:21,202 : INFO : topic #3 (0.090): 0.006*"image" + 0.005*"class" + 0.003*"classifier" + 0.003*"classification" + 0.003*"recognition" + 0.003*"component" + 0.003*"kernel" + 0.003*"noise" + 0.003*"sequence" + 0.002*"rule"
+    2022-04-22 17:43:21,203 : INFO : topic #8 (0.095): 0.004*"hidden" + 0.003*"signal" + 0.003*"rule" + 0.003*"dynamic" + 0.002*"control" + 0.002*"prediction" + 0.002*"net" + 0.002*"sequence" + 0.002*"speech" + 0.002*"matrix"
+    2022-04-22 17:43:21,203 : INFO : topic #6 (0.097): 0.006*"image" + 0.005*"cell" + 0.004*"neuron" + 0.004*"layer" + 0.004*"field" + 0.004*"object" + 0.003*"recognition" + 0.003*"signal" + 0.003*"noise" + 0.003*"class"
+    2022-04-22 17:43:21,203 : INFO : topic diff=1.159133, rho=1.000000
+    2022-04-22 17:43:21,212 : INFO : PROGRESS: pass 1, at document #1740/1740
+    2022-04-22 17:43:30,981 : INFO : optimized alpha [0.05010912, 0.057179544, 0.06367695, 0.07760008, 0.061386272, 0.06139503, 0.06987214, 0.050920427, 0.08028384, 0.05094144]
+    2022-04-22 17:43:30,987 : INFO : topic #0 (0.050): 0.009*"word" + 0.009*"hidden" + 0.008*"recognition" + 0.005*"gaussian" + 0.005*"speech" + 0.004*"hidden_unit" + 0.004*"mixture" + 0.003*"layer" + 0.003*"component" + 0.003*"likelihood"
+    2022-04-22 17:43:30,987 : INFO : topic #9 (0.051): 0.019*"neuron" + 0.009*"cell" + 0.009*"spike" + 0.007*"signal" + 0.006*"response" + 0.005*"firing" + 0.005*"stimulus" + 0.005*"noise" + 0.004*"layer" + 0.004*"visual"
+    2022-04-22 17:43:30,987 : INFO : topic #6 (0.070): 0.007*"image" + 0.006*"cell" + 0.005*"object" + 0.005*"field" + 0.004*"motion" + 0.004*"visual" + 0.004*"signal" + 0.004*"direction" + 0.004*"layer" + 0.004*"filter"
+    2022-04-22 17:43:30,988 : INFO : topic #3 (0.078): 0.008*"image" + 0.006*"class" + 0.005*"classifier" + 0.004*"classification" + 0.003*"kernel" + 0.003*"recognition" + 0.003*"component" + 0.003*"noise" + 0.003*"estimate" + 0.003*"gaussian"
+    2022-04-22 17:43:30,988 : INFO : topic #8 (0.080): 0.004*"hidden" + 0.004*"rule" +
0.003*"sequence" + 0.003*"prediction" + 0.003*"net" + 0.003*"bound" + 0.003*"optimal" + 0.003*"signal" + 0.003*"dynamic" + 0.002*"hidden_unit" + 2022-04-22 17:43:30,988 : INFO : topic diff=0.292768, rho=0.577350 + 2022-04-22 17:43:30,996 : INFO : PROGRESS: pass 2, at document #1740/1740 + 2022-04-22 17:43:38,324 : INFO : optimized alpha [0.046267115, 0.049782153, 0.055386752, 0.070311576, 0.054385237, 0.052613482, 0.0592381, 0.044921257, 0.07121881, 0.045337107] + 2022-04-22 17:43:38,330 : INFO : topic #7 (0.045): 0.009*"chip" + 0.006*"analog" + 0.006*"neuron" + 0.006*"noise" + 0.006*"memory" + 0.005*"layer" + 0.004*"connection" + 0.004*"signal" + 0.004*"circuit" + 0.004*"image" + 2022-04-22 17:43:38,331 : INFO : topic #9 (0.045): 0.021*"neuron" + 0.011*"spike" + 0.011*"cell" + 0.007*"signal" + 0.007*"response" + 0.007*"stimulus" + 0.006*"firing" + 0.005*"noise" + 0.004*"visual" + 0.004*"layer" + 2022-04-22 17:43:38,331 : INFO : topic #6 (0.059): 0.009*"image" + 0.007*"object" + 0.006*"cell" + 0.006*"visual" + 0.006*"motion" + 0.005*"field" + 0.005*"direction" + 0.004*"filter" + 0.004*"signal" + 0.004*"response" + 2022-04-22 17:43:38,331 : INFO : topic #3 (0.070): 0.007*"image" + 0.007*"class" + 0.005*"classifier" + 0.004*"classification" + 0.003*"kernel" + 0.003*"sample" + 0.003*"estimate" + 0.003*"gaussian" + 0.003*"component" + 0.003*"noise" + 2022-04-22 17:43:38,331 : INFO : topic #8 (0.071): 0.005*"hidden" + 0.005*"rule" + 0.003*"sequence" + 0.003*"net" + 0.003*"bound" + 0.003*"prediction" + 0.003*"optimal" + 0.003*"generalization" + 0.003*"hidden_unit" + 0.002*"tree" + 2022-04-22 17:43:38,331 : INFO : topic diff=0.259048, rho=0.500000 + 2022-04-22 17:43:38,339 : INFO : PROGRESS: pass 3, at document #1740/1740 + 2022-04-22 17:43:44,815 : INFO : optimized alpha [0.04398281, 0.045212083, 0.050260257, 0.066244416, 0.050919566, 0.047668763, 0.053777307, 0.041211806, 0.06501518, 0.041524593] + 2022-04-22 17:43:44,821 : INFO : topic #7 (0.041): 0.010*"chip" + 0.007*"analog" + 0.007*"neuron" + 0.006*"memory" + 0.006*"noise" + 0.005*"circuit" + 0.005*"signal" + 0.005*"layer" + 0.004*"voltage" + 0.004*"connection" + 2022-04-22 17:43:44,821 : INFO : topic #9 (0.042): 0.021*"neuron" + 0.012*"spike" + 0.012*"cell" + 0.008*"signal" + 0.008*"stimulus" + 0.008*"response" + 0.007*"firing" + 0.005*"noise" + 0.004*"visual" + 0.004*"activity" + 2022-04-22 17:43:44,821 : INFO : topic #6 (0.054): 0.011*"image" + 0.008*"object" + 0.007*"visual" + 0.007*"motion" + 0.006*"field" + 0.006*"cell" + 0.005*"direction" + 0.005*"filter" + 0.004*"signal" + 0.004*"response" + 2022-04-22 17:43:44,822 : INFO : topic #8 (0.065): 0.005*"rule" + 0.005*"hidden" + 0.003*"sequence" + 0.003*"generalization" + 0.003*"net" + 0.003*"bound" + 0.003*"prediction" + 0.003*"hidden_unit" + 0.003*"optimal" + 0.003*"machine" + 2022-04-22 17:43:44,822 : INFO : topic #3 (0.066): 0.007*"image" + 0.007*"class" + 0.005*"classifier" + 0.005*"classification" + 0.004*"gaussian" + 0.004*"sample" + 0.003*"estimate" + 0.003*"kernel" + 0.003*"noise" + 0.003*"component" + 2022-04-22 17:43:44,822 : INFO : topic diff=0.235399, rho=0.447214 + 2022-04-22 17:43:44,830 : INFO : PROGRESS: pass 4, at document #1740/1740 + 2022-04-22 17:43:50,907 : INFO : optimized alpha [0.042409703, 0.0423433, 0.04680129, 0.06358971, 0.049375836, 0.044652227, 0.0507185, 0.038540646, 0.06110631, 0.038821314] + 2022-04-22 17:43:50,913 : INFO : topic #7 (0.039): 0.011*"chip" + 0.008*"analog" + 0.008*"neuron" + 0.007*"circuit" + 0.007*"memory" + 0.006*"noise" + 0.006*"signal" 
+ 0.005*"voltage" + 0.005*"layer" + 0.004*"vlsi" + 2022-04-22 17:43:50,914 : INFO : topic #9 (0.039): 0.021*"neuron" + 0.013*"spike" + 0.013*"cell" + 0.009*"stimulus" + 0.009*"signal" + 0.009*"response" + 0.007*"firing" + 0.006*"noise" + 0.004*"activity" + 0.004*"visual" + 2022-04-22 17:43:50,914 : INFO : topic #6 (0.051): 0.013*"image" + 0.009*"object" + 0.008*"visual" + 0.007*"motion" + 0.007*"field" + 0.006*"cell" + 0.006*"direction" + 0.005*"filter" + 0.005*"response" + 0.004*"map" + 2022-04-22 17:43:50,914 : INFO : topic #8 (0.061): 0.006*"rule" + 0.005*"hidden" + 0.004*"generalization" + 0.004*"sequence" + 0.003*"net" + 0.003*"prediction" + 0.003*"hidden_unit" + 0.003*"bound" + 0.003*"machine" + 0.003*"tree" + 2022-04-22 17:43:50,914 : INFO : topic #3 (0.064): 0.007*"class" + 0.006*"image" + 0.005*"classifier" + 0.005*"classification" + 0.004*"gaussian" + 0.004*"sample" + 0.004*"estimate" + 0.003*"kernel" + 0.003*"density" + 0.003*"prior" + 2022-04-22 17:43:50,915 : INFO : topic diff=0.220905, rho=0.408248 + 2022-04-22 17:43:50,922 : INFO : PROGRESS: pass 5, at document #1740/1740 + 2022-04-22 17:43:57,459 : INFO : optimized alpha [0.04136415, 0.040443134, 0.04439863, 0.062082667, 0.048723623, 0.042787064, 0.048876576, 0.036657482, 0.058343116, 0.03701785] + 2022-04-22 17:43:57,465 : INFO : topic #7 (0.037): 0.012*"chip" + 0.009*"analog" + 0.008*"neuron" + 0.008*"circuit" + 0.007*"memory" + 0.006*"signal" + 0.006*"noise" + 0.006*"voltage" + 0.005*"vlsi" + 0.004*"layer" + 2022-04-22 17:43:57,465 : INFO : topic #9 (0.037): 0.022*"neuron" + 0.013*"spike" + 0.013*"cell" + 0.009*"stimulus" + 0.009*"signal" + 0.009*"response" + 0.008*"firing" + 0.006*"noise" + 0.004*"activity" + 0.004*"channel" + 2022-04-22 17:43:57,466 : INFO : topic #6 (0.049): 0.015*"image" + 0.010*"object" + 0.009*"visual" + 0.007*"motion" + 0.007*"field" + 0.006*"direction" + 0.006*"cell" + 0.005*"filter" + 0.005*"map" + 0.005*"response" + 2022-04-22 17:43:57,466 : INFO : topic #8 (0.058): 0.006*"rule" + 0.005*"hidden" + 0.004*"generalization" + 0.004*"sequence" + 0.003*"net" + 0.003*"hidden_unit" + 0.003*"prediction" + 0.003*"bound" + 0.003*"machine" + 0.003*"tree" + 2022-04-22 17:43:57,466 : INFO : topic #3 (0.062): 0.007*"class" + 0.006*"image" + 0.005*"classifier" + 0.005*"classification" + 0.004*"gaussian" + 0.004*"sample" + 0.004*"estimate" + 0.004*"density" + 0.004*"prior" + 0.003*"bayesian" + 2022-04-22 17:43:57,467 : INFO : topic diff=0.210451, rho=0.377964 + 2022-04-22 17:43:57,477 : INFO : PROGRESS: pass 6, at document #1740/1740 + 2022-04-22 17:44:02,657 : INFO : optimized alpha [0.040722344, 0.039083496, 0.04264549, 0.061218463, 0.048731733, 0.041630186, 0.047772773, 0.03532755, 0.0563227, 0.03579225] + 2022-04-22 17:44:02,663 : INFO : topic #7 (0.035): 0.012*"chip" + 0.010*"analog" + 0.009*"circuit" + 0.009*"neuron" + 0.007*"memory" + 0.007*"signal" + 0.006*"voltage" + 0.006*"noise" + 0.005*"vlsi" + 0.004*"implementation" + 2022-04-22 17:44:02,663 : INFO : topic #9 (0.036): 0.022*"neuron" + 0.014*"spike" + 0.013*"cell" + 0.010*"stimulus" + 0.009*"signal" + 0.009*"response" + 0.008*"firing" + 0.006*"noise" + 0.005*"channel" + 0.005*"activity" + 2022-04-22 17:44:02,664 : INFO : topic #4 (0.049): 0.008*"matrix" + 0.006*"gradient" + 0.005*"solution" + 0.004*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.003*"optimization" + 0.003*"neuron" + 0.003*"eq" + 2022-04-22 17:44:02,664 : INFO : topic #8 (0.056): 0.007*"rule" + 0.005*"hidden" + 0.004*"generalization" + 0.004*"sequence" + 
0.004*"net" + 0.003*"hidden_unit" + 0.003*"prediction" + 0.003*"tree" + 0.003*"machine" + 0.003*"bound" + 2022-04-22 17:44:02,664 : INFO : topic #3 (0.061): 0.007*"class" + 0.005*"classifier" + 0.005*"image" + 0.005*"gaussian" + 0.005*"classification" + 0.004*"sample" + 0.004*"estimate" + 0.004*"density" + 0.004*"prior" + 0.004*"bayesian" + 2022-04-22 17:44:02,664 : INFO : topic diff=0.201353, rho=0.353553 + 2022-04-22 17:44:02,673 : INFO : PROGRESS: pass 7, at document #1740/1740 + 2022-04-22 17:44:08,716 : INFO : optimized alpha [0.040365368, 0.038083963, 0.041339714, 0.06076524, 0.04909782, 0.040898465, 0.047129765, 0.034341704, 0.054831598, 0.034885667] + 2022-04-22 17:44:08,722 : INFO : topic #7 (0.034): 0.013*"chip" + 0.010*"circuit" + 0.010*"analog" + 0.009*"neuron" + 0.007*"memory" + 0.007*"signal" + 0.007*"voltage" + 0.006*"noise" + 0.005*"vlsi" + 0.005*"implementation" + 2022-04-22 17:44:08,723 : INFO : topic #9 (0.035): 0.022*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.010*"stimulus" + 0.010*"signal" + 0.010*"response" + 0.008*"firing" + 0.006*"noise" + 0.005*"channel" + 0.005*"activity" + 2022-04-22 17:44:08,723 : INFO : topic #4 (0.049): 0.009*"matrix" + 0.006*"gradient" + 0.005*"solution" + 0.004*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.003*"optimization" + 0.003*"eq" + 0.003*"neuron" + 2022-04-22 17:44:08,723 : INFO : topic #8 (0.055): 0.007*"rule" + 0.005*"hidden" + 0.005*"generalization" + 0.004*"sequence" + 0.004*"hidden_unit" + 0.004*"net" + 0.003*"prediction" + 0.003*"tree" + 0.003*"machine" + 0.003*"bound" + 2022-04-22 17:44:08,723 : INFO : topic #3 (0.061): 0.007*"class" + 0.005*"classifier" + 0.005*"gaussian" + 0.005*"classification" + 0.005*"sample" + 0.005*"image" + 0.004*"estimate" + 0.004*"density" + 0.004*"prior" + 0.004*"bayesian" + 2022-04-22 17:44:08,724 : INFO : topic diff=0.192330, rho=0.333333 + 2022-04-22 17:44:08,732 : INFO : PROGRESS: pass 8, at document #1740/1740 + 2022-04-22 17:44:13,585 : INFO : optimized alpha [0.040182494, 0.037441313, 0.04036209, 0.060601927, 0.049758103, 0.04055522, 0.046829112, 0.03359148, 0.053864058, 0.03418947] + 2022-04-22 17:44:13,591 : INFO : topic #7 (0.034): 0.013*"chip" + 0.011*"circuit" + 0.011*"analog" + 0.009*"neuron" + 0.007*"signal" + 0.007*"memory" + 0.007*"voltage" + 0.006*"vlsi" + 0.006*"noise" + 0.005*"implementation" + 2022-04-22 17:44:13,592 : INFO : topic #9 (0.034): 0.022*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.010*"stimulus" + 0.010*"signal" + 0.010*"response" + 0.008*"firing" + 0.006*"noise" + 0.005*"channel" + 0.005*"frequency" + 2022-04-22 17:44:13,592 : INFO : topic #4 (0.050): 0.009*"matrix" + 0.006*"gradient" + 0.005*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.003*"optimization" + 0.003*"eq" + 0.003*"descent" + 2022-04-22 17:44:13,592 : INFO : topic #8 (0.054): 0.007*"rule" + 0.006*"hidden" + 0.005*"generalization" + 0.004*"sequence" + 0.004*"hidden_unit" + 0.004*"net" + 0.003*"prediction" + 0.003*"tree" + 0.003*"machine" + 0.003*"trained" + 2022-04-22 17:44:13,592 : INFO : topic #3 (0.061): 0.007*"class" + 0.005*"classifier" + 0.005*"gaussian" + 0.005*"sample" + 0.005*"classification" + 0.004*"estimate" + 0.004*"density" + 0.004*"image" + 0.004*"prior" + 0.004*"bayesian" + 2022-04-22 17:44:13,593 : INFO : topic diff=0.182985, rho=0.316228 + 2022-04-22 17:44:13,601 : INFO : PROGRESS: pass 9, at document #1740/1740 + 2022-04-22 17:44:19,306 : INFO : optimized alpha [0.040097952, 0.036957335, 0.039702885, 0.060680483, 
0.050588053, 0.040437363, 0.046769954, 0.033025023, 0.053330485, 0.033663847] + 2022-04-22 17:44:19,312 : INFO : topic #7 (0.033): 0.013*"chip" + 0.012*"circuit" + 0.011*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.007*"memory" + 0.007*"voltage" + 0.006*"vlsi" + 0.005*"noise" + 0.005*"implementation" + 2022-04-22 17:44:19,312 : INFO : topic #9 (0.034): 0.022*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.010*"stimulus" + 0.010*"signal" + 0.010*"response" + 0.008*"firing" + 0.007*"noise" + 0.006*"channel" + 0.005*"frequency" + 2022-04-22 17:44:19,313 : INFO : topic #4 (0.051): 0.009*"matrix" + 0.006*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.003*"optimization" + 0.003*"descent" + 2022-04-22 17:44:19,313 : INFO : topic #8 (0.053): 0.008*"rule" + 0.006*"hidden" + 0.005*"generalization" + 0.004*"sequence" + 0.004*"hidden_unit" + 0.004*"net" + 0.004*"prediction" + 0.003*"tree" + 0.003*"machine" + 0.003*"trained" + 2022-04-22 17:44:19,313 : INFO : topic #3 (0.061): 0.007*"class" + 0.005*"gaussian" + 0.005*"classifier" + 0.005*"sample" + 0.005*"classification" + 0.005*"estimate" + 0.004*"density" + 0.004*"prior" + 0.004*"bayesian" + 0.004*"mixture" + 2022-04-22 17:44:19,313 : INFO : topic diff=0.173278, rho=0.301511 + 2022-04-22 17:44:19,321 : INFO : PROGRESS: pass 10, at document #1740/1740 + 2022-04-22 17:44:23,819 : INFO : optimized alpha [0.040098477, 0.036638554, 0.03923829, 0.060877353, 0.051485594, 0.04045682, 0.04686068, 0.032584008, 0.05302629, 0.03327818] + 2022-04-22 17:44:23,825 : INFO : topic #7 (0.033): 0.013*"chip" + 0.012*"circuit" + 0.011*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.007*"memory" + 0.007*"voltage" + 0.006*"vlsi" + 0.005*"noise" + 0.005*"implementation" + 2022-04-22 17:44:23,825 : INFO : topic #9 (0.033): 0.021*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.011*"stimulus" + 0.010*"signal" + 0.010*"response" + 0.008*"firing" + 0.007*"noise" + 0.006*"channel" + 0.006*"frequency" + 2022-04-22 17:44:23,826 : INFO : topic #4 (0.051): 0.009*"matrix" + 0.006*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.004*"optimization" + 0.003*"descent" + 2022-04-22 17:44:23,826 : INFO : topic #8 (0.053): 0.008*"rule" + 0.006*"hidden" + 0.005*"generalization" + 0.004*"hidden_unit" + 0.004*"sequence" + 0.004*"net" + 0.004*"prediction" + 0.004*"tree" + 0.003*"machine" + 0.003*"trained" + 2022-04-22 17:44:23,826 : INFO : topic #3 (0.061): 0.007*"class" + 0.006*"gaussian" + 0.005*"classifier" + 0.005*"sample" + 0.005*"estimate" + 0.005*"classification" + 0.004*"density" + 0.004*"prior" + 0.004*"mixture" + 0.004*"bayesian" + 2022-04-22 17:44:23,827 : INFO : topic diff=0.163348, rho=0.288675 + 2022-04-22 17:44:23,834 : INFO : PROGRESS: pass 11, at document #1740/1740 + 2022-04-22 17:44:29,135 : INFO : optimized alpha [0.040188633, 0.03646946, 0.038880475, 0.06112813, 0.05245481, 0.04061286, 0.047049697, 0.03229136, 0.05290524, 0.03296597] + 2022-04-22 17:44:29,141 : INFO : topic #7 (0.032): 0.013*"chip" + 0.013*"circuit" + 0.011*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.007*"memory" + 0.007*"voltage" + 0.006*"vlsi" + 0.005*"noise" + 0.005*"implementation" + 2022-04-22 17:44:29,141 : INFO : topic #9 (0.033): 0.021*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.011*"signal" + 0.011*"stimulus" + 0.010*"response" + 0.008*"firing" + 0.007*"noise" + 0.006*"frequency" + 0.006*"channel" + 2022-04-22 17:44:29,142 : INFO : topic #4 (0.052): 
0.009*"matrix" + 0.006*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.004*"optimization" + 0.003*"optimal" + 2022-04-22 17:44:29,142 : INFO : topic #8 (0.053): 0.008*"rule" + 0.006*"hidden" + 0.005*"generalization" + 0.004*"hidden_unit" + 0.004*"sequence" + 0.004*"prediction" + 0.004*"net" + 0.004*"tree" + 0.003*"machine" + 0.003*"trained" + 2022-04-22 17:44:29,142 : INFO : topic #3 (0.061): 0.007*"class" + 0.006*"gaussian" + 0.005*"classifier" + 0.005*"sample" + 0.005*"estimate" + 0.005*"classification" + 0.005*"density" + 0.004*"prior" + 0.004*"mixture" + 0.004*"bayesian" + 2022-04-22 17:44:29,142 : INFO : topic diff=0.153485, rho=0.277350 + 2022-04-22 17:44:29,150 : INFO : PROGRESS: pass 12, at document #1740/1740 + 2022-04-22 17:44:33,545 : INFO : optimized alpha [0.04036388, 0.03635188, 0.038611963, 0.061483774, 0.05345723, 0.040894084, 0.04736741, 0.03211178, 0.05297828, 0.03274891] + 2022-04-22 17:44:33,551 : INFO : topic #7 (0.032): 0.013*"circuit" + 0.013*"chip" + 0.011*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.007*"voltage" + 0.007*"memory" + 0.006*"vlsi" + 0.005*"implementation" + 0.005*"noise" + 2022-04-22 17:44:33,552 : INFO : topic #9 (0.033): 0.021*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.011*"signal" + 0.011*"stimulus" + 0.011*"response" + 0.009*"firing" + 0.007*"noise" + 0.006*"frequency" + 0.006*"channel" + 2022-04-22 17:44:33,552 : INFO : topic #8 (0.053): 0.008*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.004*"hidden_unit" + 0.004*"sequence" + 0.004*"prediction" + 0.004*"net" + 0.004*"tree" + 0.003*"machine" + 0.003*"trained" + 2022-04-22 17:44:33,552 : INFO : topic #4 (0.053): 0.009*"matrix" + 0.006*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.003*"optimization" + 0.003*"optimal" + 2022-04-22 17:44:33,552 : INFO : topic #3 (0.061): 0.007*"class" + 0.006*"gaussian" + 0.005*"sample" + 0.005*"classifier" + 0.005*"estimate" + 0.005*"density" + 0.005*"classification" + 0.004*"prior" + 0.004*"mixture" + 0.004*"bayesian" + 2022-04-22 17:44:33,553 : INFO : topic diff=0.143831, rho=0.267261 + 2022-04-22 17:44:33,562 : INFO : PROGRESS: pass 13, at document #1740/1740 + 2022-04-22 17:44:39,235 : INFO : optimized alpha [0.040587135, 0.03631959, 0.03839379, 0.061911535, 0.05453887, 0.041285977, 0.047773384, 0.032027513, 0.05315258, 0.03261802] + 2022-04-22 17:44:39,246 : INFO : topic #7 (0.032): 0.014*"circuit" + 0.013*"chip" + 0.011*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.007*"memory" + 0.006*"vlsi" + 0.005*"implementation" + 0.005*"noise" + 2022-04-22 17:44:39,246 : INFO : topic #9 (0.033): 0.021*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.011*"signal" + 0.011*"stimulus" + 0.011*"response" + 0.009*"firing" + 0.007*"noise" + 0.007*"frequency" + 0.006*"channel" + 2022-04-22 17:44:39,247 : INFO : topic #8 (0.053): 0.008*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.004*"hidden_unit" + 0.004*"sequence" + 0.004*"prediction" + 0.004*"net" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained" + 2022-04-22 17:44:39,247 : INFO : topic #4 (0.055): 0.009*"matrix" + 0.007*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.003*"optimization" + 0.003*"optimal" + 2022-04-22 17:44:39,247 : INFO : topic #3 (0.062): 0.007*"class" + 0.006*"gaussian" + 0.005*"sample" + 0.005*"classifier" + 0.005*"estimate" + 0.005*"density" + 
0.005*"classification" + 0.004*"prior" + 0.004*"mixture" + 0.004*"bayesian" + 2022-04-22 17:44:39,248 : INFO : topic diff=0.134602, rho=0.258199 + 2022-04-22 17:44:39,258 : INFO : PROGRESS: pass 14, at document #1740/1740 + 2022-04-22 17:44:46,319 : INFO : optimized alpha [0.040821876, 0.036360793, 0.03824259, 0.062456302, 0.055688635, 0.041737743, 0.048259463, 0.032020763, 0.05343126, 0.03254091] + 2022-04-22 17:44:46,325 : INFO : topic #7 (0.032): 0.014*"circuit" + 0.013*"chip" + 0.012*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.007*"memory" + 0.006*"vlsi" + 0.005*"implementation" + 0.005*"noise" + 2022-04-22 17:44:46,326 : INFO : topic #9 (0.033): 0.021*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.011*"signal" + 0.011*"response" + 0.011*"stimulus" + 0.009*"firing" + 0.007*"noise" + 0.007*"frequency" + 0.006*"channel" + 2022-04-22 17:44:46,327 : INFO : topic #8 (0.053): 0.008*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.004*"hidden_unit" + 0.004*"sequence" + 0.004*"prediction" + 0.004*"net" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained" + 2022-04-22 17:44:46,327 : INFO : topic #4 (0.056): 0.009*"matrix" + 0.007*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.004*"optimal" + 0.003*"optimization" + 2022-04-22 17:44:46,327 : INFO : topic #3 (0.062): 0.007*"class" + 0.006*"gaussian" + 0.005*"sample" + 0.005*"classifier" + 0.005*"estimate" + 0.005*"density" + 0.004*"mixture" + 0.004*"classification" + 0.004*"prior" + 0.004*"bayesian" + 2022-04-22 17:44:46,328 : INFO : topic diff=0.125871, rho=0.250000 + 2022-04-22 17:44:46,338 : INFO : PROGRESS: pass 15, at document #1740/1740 + 2022-04-22 17:44:53,655 : INFO : optimized alpha [0.04109236, 0.036467522, 0.0381424, 0.06306473, 0.056903645, 0.04227092, 0.04874864, 0.032058466, 0.053792715, 0.03251973] + 2022-04-22 17:44:53,666 : INFO : topic #7 (0.032): 0.014*"circuit" + 0.013*"chip" + 0.012*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.007*"memory" + 0.006*"vlsi" + 0.005*"implementation" + 0.005*"bit" + 2022-04-22 17:44:53,666 : INFO : topic #9 (0.033): 0.021*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.011*"signal" + 0.011*"response" + 0.011*"stimulus" + 0.009*"firing" + 0.007*"frequency" + 0.007*"noise" + 0.006*"channel" + 2022-04-22 17:44:53,667 : INFO : topic #8 (0.054): 0.008*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.004*"hidden_unit" + 0.004*"prediction" + 0.004*"sequence" + 0.004*"net" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained" + 2022-04-22 17:44:53,667 : INFO : topic #4 (0.057): 0.009*"matrix" + 0.007*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.004*"optimal" + 0.003*"optimization" + 2022-04-22 17:44:53,667 : INFO : topic #3 (0.063): 0.007*"class" + 0.006*"gaussian" + 0.005*"sample" + 0.005*"estimate" + 0.005*"classifier" + 0.005*"density" + 0.005*"mixture" + 0.005*"prior" + 0.004*"classification" + 0.004*"bayesian" + 2022-04-22 17:44:53,667 : INFO : topic diff=0.117670, rho=0.242536 + 2022-04-22 17:44:53,679 : INFO : PROGRESS: pass 16, at document #1740/1740 + 2022-04-22 17:45:00,393 : INFO : optimized alpha [0.041376065, 0.03660367, 0.0380804, 0.06374838, 0.058118302, 0.0428449, 0.049285352, 0.03212048, 0.054208644, 0.032528903] + 2022-04-22 17:45:00,403 : INFO : topic #7 (0.032): 0.014*"circuit" + 0.013*"chip" + 0.012*"analog" + 0.011*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.008*"memory" + 0.006*"vlsi" + 
0.006*"implementation" + 0.005*"bit" + 2022-04-22 17:45:00,403 : INFO : topic #9 (0.033): 0.021*"neuron" + 0.014*"cell" + 0.014*"spike" + 0.012*"signal" + 0.011*"response" + 0.011*"stimulus" + 0.009*"firing" + 0.007*"frequency" + 0.007*"noise" + 0.007*"channel" + 2022-04-22 17:45:00,404 : INFO : topic #8 (0.054): 0.008*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.004*"hidden_unit" + 0.004*"prediction" + 0.004*"sequence" + 0.004*"net" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained" + 2022-04-22 17:45:00,404 : INFO : topic #4 (0.058): 0.009*"matrix" + 0.007*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.004*"optimal" + 0.003*"optimization" + 2022-04-22 17:45:00,404 : INFO : topic #3 (0.064): 0.007*"class" + 0.006*"gaussian" + 0.006*"sample" + 0.005*"estimate" + 0.005*"classifier" + 0.005*"density" + 0.005*"mixture" + 0.005*"prior" + 0.004*"classification" + 0.004*"bayesian" + 2022-04-22 17:45:00,405 : INFO : topic diff=0.109988, rho=0.235702 + 2022-04-22 17:45:00,416 : INFO : PROGRESS: pass 17, at document #1740/1740 + 2022-04-22 17:45:09,386 : INFO : optimized alpha [0.041690826, 0.036777373, 0.038074017, 0.06447209, 0.059317604, 0.043464534, 0.04985148, 0.032209247, 0.05470903, 0.032565065] + 2022-04-22 17:45:09,400 : INFO : topic #7 (0.032): 0.014*"circuit" + 0.013*"chip" + 0.012*"analog" + 0.011*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.008*"memory" + 0.006*"vlsi" + 0.006*"implementation" + 0.005*"bit" + 2022-04-22 17:45:09,400 : INFO : topic #9 (0.033): 0.020*"neuron" + 0.014*"cell" + 0.014*"spike" + 0.012*"signal" + 0.011*"response" + 0.011*"stimulus" + 0.009*"firing" + 0.007*"frequency" + 0.007*"noise" + 0.007*"channel" + 2022-04-22 17:45:09,401 : INFO : topic #8 (0.055): 0.008*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.004*"hidden_unit" + 0.004*"prediction" + 0.004*"sequence" + 0.004*"net" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained" + 2022-04-22 17:45:09,401 : INFO : topic #4 (0.059): 0.009*"matrix" + 0.007*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.004*"optimal" + 0.003*"optimization" + 2022-04-22 17:45:09,402 : INFO : topic #3 (0.064): 0.007*"class" + 0.006*"gaussian" + 0.006*"sample" + 0.005*"estimate" + 0.005*"classifier" + 0.005*"density" + 0.005*"mixture" + 0.005*"prior" + 0.004*"classification" + 0.004*"bayesian" + 2022-04-22 17:45:09,402 : INFO : topic diff=0.102916, rho=0.229416 + 2022-04-22 17:45:09,423 : INFO : PROGRESS: pass 18, at document #1740/1740 + 2022-04-22 17:45:19,067 : INFO : optimized alpha [0.042022552, 0.037017036, 0.038090236, 0.06523256, 0.06052085, 0.044076443, 0.050475497, 0.03232651, 0.055261094, 0.032642473] + 2022-04-22 17:45:19,077 : INFO : topic #7 (0.032): 0.015*"circuit" + 0.013*"chip" + 0.012*"analog" + 0.011*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.008*"memory" + 0.006*"vlsi" + 0.006*"implementation" + 0.005*"bit" + 2022-04-22 17:45:19,077 : INFO : topic #9 (0.033): 0.020*"neuron" + 0.014*"cell" + 0.014*"spike" + 0.012*"signal" + 0.011*"response" + 0.011*"stimulus" + 0.009*"firing" + 0.008*"frequency" + 0.007*"noise" + 0.007*"channel" + 2022-04-22 17:45:19,078 : INFO : topic #8 (0.055): 0.009*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.005*"hidden_unit" + 0.004*"prediction" + 0.004*"net" + 0.004*"sequence" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained" + 2022-04-22 17:45:19,078 : INFO : topic #4 (0.061): 0.009*"matrix" + 0.007*"gradient" + 
0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"minimum" + 0.004*"let" + 0.004*"eq" + 0.004*"optimal" + 0.003*"optimization"
+    2022-04-22 17:45:19,078 : INFO : topic #3 (0.065): 0.007*"class" + 0.007*"gaussian" + 0.006*"sample" + 0.005*"estimate" + 0.005*"density" + 0.005*"classifier" + 0.005*"mixture" + 0.005*"prior" + 0.004*"classification" + 0.004*"likelihood"
+    2022-04-22 17:45:19,079 : INFO : topic diff=0.096362, rho=0.223607
+    2022-04-22 17:45:19,090 : INFO : PROGRESS: pass 19, at document #1740/1740
+    2022-04-22 17:45:26,202 : INFO : optimized alpha [0.042380035, 0.037280142, 0.03813037, 0.06597655, 0.0617652, 0.044686105, 0.051100377, 0.032451425, 0.05581024, 0.03274816]
+    2022-04-22 17:45:26,210 : INFO : topic #7 (0.032): 0.015*"circuit" + 0.013*"chip" + 0.012*"analog" + 0.011*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.008*"memory" + 0.006*"vlsi" + 0.006*"implementation" + 0.005*"bit"
+    2022-04-22 17:45:26,210 : INFO : topic #9 (0.033): 0.020*"neuron" + 0.015*"cell" + 0.014*"spike" + 0.012*"signal" + 0.011*"response" + 0.011*"stimulus" + 0.009*"firing" + 0.008*"frequency" + 0.007*"noise" + 0.007*"channel"
+    2022-04-22 17:45:26,210 : INFO : topic #8 (0.056): 0.009*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.005*"hidden_unit" + 0.004*"prediction" + 0.004*"net" + 0.004*"sequence" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained"
+    2022-04-22 17:45:26,211 : INFO : topic #4 (0.062): 0.009*"matrix" + 0.007*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"minimum" + 0.004*"let" + 0.004*"eq" + 0.004*"optimal" + 0.003*"energy"
+    2022-04-22 17:45:26,211 : INFO : topic #3 (0.066): 0.007*"class" + 0.007*"gaussian" + 0.006*"sample" + 0.005*"estimate" + 0.005*"density" + 0.005*"mixture" + 0.005*"classifier" + 0.005*"prior" + 0.004*"likelihood" + 0.004*"bayesian"
+    2022-04-22 17:45:26,211 : INFO : topic diff=0.090311, rho=0.218218
+    2022-04-22 17:45:26,222 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel in 141.08s', 'datetime': '2022-04-22T17:45:26.222157', 'gensim': '4.1.3.dev0', 'python': '3.9.7 (default, Sep 3 2021, 12:37:55) \n[Clang 12.0.5 (clang-1205.0.22.9)]', 'platform': 'macOS-11.6.5-x86_64-i386-64bit', 'event': 'created'}
@@ -696,7 +654,7 @@ methods on the blog at http://rare-technologies.com/lda-training-tips/ !
 .. code-block:: default
 
-    top_topics = model.top_topics(corpus) #, num_words=20)
+    top_topics = model.top_topics(corpus)
 
     # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
     avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
@@ -715,218 +673,218 @@ methods on the blog at http://rare-technologies.com/lda-training-tips/ !
 .. code-block:: none
 
-    2021-03-19 14:12:02,008 : INFO : CorpusAccumulator accumulated stats from 1000 documents
-    Average topic coherence: -1.1072.
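Context for the regenerated log above: the new lifecycle event comes from a plain multi-pass online LDA run with alpha auto-tuning. The following is a minimal sketch of a training call consistent with what the log reports (10 topics, chunksize 2000, 20 passes, 400 iterations, perplexity evaluation off); the tiny `docs` list is only a stand-in for the tutorial's 1740-document NIPS corpus, included so the snippet is self-contained.

.. code-block:: python

    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    # Toy stand-in for the tutorial's NIPS corpus; any tokenized documents work.
    docs = [["neuron", "spike", "cell"], ["matrix", "gradient", "solution"]]
    id2word = Dictionary(docs)
    corpus = [id2word.doc2bow(doc) for doc in docs]

    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=10,    # "10 topics" in the log
        chunksize=2000,   # chunk > corpus, so "updating model once every 1740 documents"
        passes=20,        # "20 passes over the supplied corpus"
        iterations=400,   # "iterating 400x"
        alpha='auto',     # "using autotuned alpha" -> the "optimized alpha [...]" lines
        eval_every=None,  # "evaluating perplexity every 0 documents"
    )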
- [([(0.023360161, 'neuron'), - (0.013864572, 'cell'), - (0.0085508, 'spike'), - (0.007835109, 'response'), - (0.0077002184, 'synaptic'), - (0.006420619, 'firing'), - (0.0063291225, 'activity'), - (0.005894408, 'stimulus'), - (0.005635916, 'signal'), - (0.005319338, 'frequency'), - (0.0044079474, 'potential'), - (0.0042212, 'connection'), - (0.003969707, 'fig'), - (0.0038775448, 'phase'), - (0.0037467096, 'synapsis'), - (0.0035546266, 'channel'), - (0.0035464808, 'dynamic'), - (0.0035111816, 'memory'), - (0.003500412, 'simulation'), - (0.0033668294, 'temporal')], - -0.8843724877515563), - ([(0.007043698, 'gaussian'), - (0.0058810986, 'noise'), - (0.005357382, 'estimate'), - (0.005118217, 'likelihood'), - (0.004725707, 'approximation'), - (0.0047162576, 'prior'), - (0.004589121, 'bayesian'), - (0.0044163894, 'density'), - (0.004383228, 'mixture'), - (0.0043818722, 'variance'), - (0.004343727, 'matrix'), - (0.003920799, 'log'), - (0.0039041233, 'sample'), - (0.0038657538, 'posterior'), - (0.0038494268, 'hidden'), - (0.003747304, 'prediction'), - (0.0035524433, 'generalization'), - (0.003297515, 'em'), - (0.0031830291, 'optimal'), - (0.0029574349, 'estimation')], - -0.9201121458749306), - ([(0.013338742, 'visual'), - (0.011440194, 'cell'), - (0.010699649, 'field'), - (0.009350259, 'image'), - (0.008701173, 'motion'), - (0.008576538, 'map'), - (0.0077895345, 'direction'), - (0.0073878667, 'orientation'), - (0.006964441, 'eye'), - (0.0066007036, 'response'), - (0.0062312516, 'stimulus'), - (0.006194355, 'spatial'), - (0.0055934438, 'receptive'), - (0.005137706, 'receptive_field'), - (0.00512753, 'object'), - (0.004664231, 'layer'), - (0.0046304427, 'activity'), - (0.0045092506, 'position'), - (0.004168487, 'cortex'), - (0.0040872716, 'location')], - -0.9666086669197183), - ([(0.009677556, 'hidden'), - (0.008472348, 'image'), - (0.0066851787, 'character'), - (0.0064806826, 'layer'), - (0.005060741, 'hidden_unit'), - (0.004902215, 'recognition'), - (0.004825573, 'digit'), - (0.0043749292, 'attractor'), - (0.0043325345, 'energy'), - (0.00431843, 'dynamic'), - (0.0038877935, 'matrix'), - (0.003805258, 'net'), - (0.003757226, 'field'), - (0.0035065063, 'transformation'), - (0.0034933372, 'dimensional'), - (0.0034391459, 'distance'), - (0.0031490896, 'gradient'), - (0.0031419578, 'solution'), - (0.002954112, 'map'), - (0.0028736237, 'minimum')], - -1.011100924928429), - ([(0.010836434, 'circuit'), - (0.009359381, 'chip'), - (0.008903197, 'analog'), - (0.00655248, 'neuron'), - (0.006147317, 'threshold'), - (0.0050505013, 'image'), - (0.0048734145, 'bit'), - (0.0048433533, 'voltage'), - (0.004609887, 'memory'), - (0.004231914, 'vlsi'), - (0.0042090695, 'implementation'), - (0.004113957, 'net'), - (0.003907882, 'gate'), - (0.0038376434, 'layer'), - (0.0034949183, 'pp'), - (0.003291277, 'element'), - (0.0032199384, 'node'), - (0.0030992834, 'signal'), - (0.0029631325, 'design'), - (0.0028471586, 'processor')], - -1.0450720584710176), - ([(0.008781833, 'hidden'), - (0.008109003, 'net'), - (0.0069496827, 'layer'), - (0.006155399, 'rule'), - (0.005891262, 'node'), - (0.0051560537, 'hidden_unit'), - (0.0041502067, 'architecture'), - (0.0041317134, 'activation'), - (0.0041251457, 'sequence'), - (0.0040346556, 'propagation'), - (0.0036248995, 'back'), - (0.0035959794, 'recurrent'), - (0.0031377305, 'class'), - (0.0030542722, 'trained'), - (0.0030384492, 'code'), - (0.002923781, 'expert'), - (0.0028879363, 'string'), - (0.0027964872, 'learn'), - (0.0027678378, 'table'), - (0.0027654031, 'connection')], - 
-1.122278491657109), - ([(0.014161764, 'recognition'), - (0.011104057, 'speech'), - (0.010318562, 'word'), - (0.010277273, 'image'), - (0.00809512, 'object'), - (0.0063050594, 'signal'), - (0.0053472514, 'layer'), - (0.005024713, 'classification'), - (0.0050242324, 'face'), - (0.004580911, 'trained'), - (0.004409548, 'human'), - (0.0043301815, 'context'), - (0.0042581595, 'frame'), - (0.0040203724, 'hidden'), - (0.004008649, 'speaker'), - (0.0035841789, 'class'), - (0.0033736168, 'sequence'), - (0.0032663026, 'hmm'), - (0.0032505158, 'architecture'), - (0.0031761383, 'view')], - -1.1844643136695376), - ([(0.0071913837, 'matrix'), - (0.006639144, 'gradient'), - (0.0058832015, 'kernel'), - (0.0058791665, 'component'), - (0.0047264574, 'class'), - (0.0042780563, 'density'), - (0.004226884, 'xi'), - (0.004164046, 'convergence'), - (0.0041592806, 'source'), - (0.0040763966, 'loss'), - (0.00392406, 'basis'), - (0.0036241056, 'regression'), - (0.0035536229, 'approximation'), - (0.0033525354, 'independent'), - (0.0032649476, 'bound'), - (0.0031867179, 'mixture'), - (0.0031306876, 'let'), - (0.0030615225, 'signal'), - (0.0030061873, 'support'), - (0.0029361995, 'pca')], - -1.2550214906161075), - ([(0.012204602, 'tree'), - (0.010181904, 'node'), - (0.010171177, 'class'), - (0.007966109, 'classifier'), - (0.0075656017, 'decision'), - (0.005655141, 'rule'), - (0.0056041405, 'classification'), - (0.0054354756, 'sample'), - (0.0050921105, 'distance'), - (0.0046420856, 'bound'), - (0.0035473844, 'let'), - (0.0032015098, 'measure'), - (0.0031701634, 'cluster'), - (0.0030615227, 'clustering'), - (0.0030600468, 'graph'), - (0.003044858, 'neighbor'), - (0.0030077181, 'nearest'), - (0.0029182513, 'call'), - (0.0027482447, 'machine'), - (0.0027105191, 'hypothesis')], - -1.2831209969858721), - ([(0.016391048, 'control'), - (0.013031393, 'action'), - (0.009197483, 'policy'), - (0.008487638, 'reinforcement'), - (0.0068111503, 'controller'), - (0.0067618974, 'dynamic'), - (0.006282514, 'robot'), - (0.0061591244, 'optimal'), - (0.005933612, 'trajectory'), - (0.00556125, 'reinforcement_learning'), - (0.004895806, 'environment'), - (0.0044026882, 'goal'), - (0.0042024464, 'reward'), - (0.0037804258, 'position'), - (0.0037499247, 'arm'), - (0.003601292, 'motor'), - (0.0034139594, 'sutton'), - (0.0031908047, 'movement'), - (0.003142896, 'td'), - (0.0031323545, 'trial')], - -1.4003243935908478)] + 2022-04-22 17:45:28,224 : INFO : CorpusAccumulator accumulated stats from 1000 documents + Average topic coherence: -1.2010. 
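The dumps swapped in this hunk are the return value of `top_topics`: one `(topic, coherence)` pair per topic, where each `topic` is a list of `(probability, word)` tuples, sorted from most to least coherent (gensim's default measure here is u_mass, hence the negative scores). A sketch of the tutorial code that produces both the summary line and the pretty-printed dump, assuming `model`, `corpus` and `num_topics` as defined earlier:

.. code-block:: python

    from pprint import pprint

    # Each element is ([(probability, word), ...], coherence_score),
    # best-coherence topic first.
    top_topics = model.top_topics(corpus)

    avg_topic_coherence = sum(t[1] for t in top_topics) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    pprint(top_topics)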
+ [([(0.009335279, 'matrix'), + (0.006810243, 'gradient'), + (0.0058767716, 'solution'), + (0.0050566536, 'convergence'), + (0.0043554083, 'distance'), + (0.004101262, 'minimum'), + (0.0040506367, 'let'), + (0.0039807004, 'eq'), + (0.0038555989, 'optimal'), + (0.0034886731, 'energy'), + (0.0034828722, 'optimization'), + (0.0034504435, 'condition'), + (0.0033918922, 'approximation'), + (0.0033640305, 'descent'), + (0.0032366295, 'constraint'), + (0.0032220806, 'xi'), + (0.003061566, 'stochastic'), + (0.0029803582, 'component'), + (0.0028803074, 'dynamic'), + (0.00280652, 'graph')], + -1.0314809310847135), + ([(0.006758064, 'class'), + (0.006583767, 'gaussian'), + (0.005633773, 'sample'), + (0.0053001167, 'estimate'), + (0.0049426625, 'density'), + (0.0048573534, 'mixture'), + (0.004835742, 'classifier'), + (0.0046612574, 'prior'), + (0.004377199, 'likelihood'), + (0.004344127, 'bayesian'), + (0.0043293545, 'classification'), + (0.0037983125, 'regression'), + (0.0037747815, 'noise'), + (0.003772593, 'log'), + (0.0037171794, 'kernel'), + (0.003717116, 'approximation'), + (0.0037102823, 'variance'), + (0.0034671598, 'component'), + (0.0032801689, 'posterior'), + (0.003173915, 'em')], + -1.0736087121706135), + ([(0.02519838, 'image'), + (0.013268676, 'object'), + (0.011446378, 'visual'), + (0.009458303, 'field'), + (0.008084482, 'motion'), + (0.006914001, 'direction'), + (0.0060067754, 'map'), + (0.0055346545, 'position'), + (0.004941865, 'pixel'), + (0.004847295, 'spatial'), + (0.0047093197, 'face'), + (0.0046589067, 'eye'), + (0.0046168645, 'location'), + (0.0043804147, 'filter'), + (0.0042905244, 'response'), + (0.0041273055, 'view'), + (0.0040860246, 'orientation'), + (0.0038862277, 'receptive'), + (0.0038229467, 'human'), + (0.0038166828, 'recognition')], + -1.101159857337566), + ([(0.015339, 'layer'), + (0.014894987, 'node'), + (0.010977563, 'net'), + (0.0097472165, 'hidden'), + (0.0075573265, 'threshold'), + (0.006544599, 'class'), + (0.006098466, 'bound'), + (0.005063979, 'activation'), + (0.0047261445, 'dimension'), + (0.0046081766, 'hidden_unit'), + (0.004463069, 'theorem'), + (0.0043413443, 'region'), + (0.0040992484, 'polynomial'), + (0.003927951, 'propagation'), + (0.003906715, 'hidden_layer'), + (0.003902104, 'back'), + (0.0034719643, 'let'), + (0.0034161368, 'bit'), + (0.0033824549, 'connection'), + (0.003204875, 'back_propagation')], + -1.1578264561349325), + ([(0.020037105, 'neuron'), + (0.01450755, 'cell'), + (0.014472483, 'spike'), + (0.011981914, 'signal'), + (0.011293252, 'response'), + (0.010934215, 'stimulus'), + (0.008777942, 'firing'), + (0.0077151447, 'frequency'), + (0.007196151, 'noise'), + (0.006772501, 'channel'), + (0.004612463, 'temporal'), + (0.0043820725, 'auditory'), + (0.0043365704, 'activity'), + (0.0040383274, 'sound'), + (0.004009629, 'potential'), + (0.0039981017, 'correlation'), + (0.0038944164, 'fig'), + (0.0036725644, 'train'), + (0.0034477867, 'firing_rate'), + (0.0033127973, 'source')], + -1.175461993278655), + ([(0.015848655, 'neuron'), + (0.015059427, 'cell'), + (0.009022958, 'activity'), + (0.008109199, 'connection'), + (0.008041161, 'synaptic'), + (0.0057249856, 'memory'), + (0.0053059673, 'cortex'), + (0.0050525647, 'dynamic'), + (0.0047387453, 'cortical'), + (0.004596282, 'simulation'), + (0.004441938, 'inhibitory'), + (0.004316362, 'phase'), + (0.004202166, 'response'), + (0.004129471, 'excitatory'), + (0.0041026585, 'attractor'), + (0.0036624784, 'synapsis'), + (0.003452054, 'fig'), + (0.003326298, 'interaction'), + (0.003292976, 'layer'), + 
(0.003188004, 'oscillator')], + -1.224961800422038), + ([(0.014448352, 'control'), + (0.011206106, 'action'), + (0.008610181, 'policy'), + (0.0073960284, 'reinforcement'), + (0.0071460134, 'dynamic'), + (0.006695718, 'trajectory'), + (0.006001844, 'optimal'), + (0.005919467, 'controller'), + (0.005142686, 'robot'), + (0.0049040187, 'reinforcement_learning'), + (0.004231131, 'environment'), + (0.0038927419, 'reward'), + (0.0036765926, 'goal'), + (0.0032516345, 'forward'), + (0.0029738136, 'arm'), + (0.0029553284, 'adaptive'), + (0.0029314642, 'sutton'), + (0.0029179594, 'position'), + (0.0028270711, 'path'), + (0.002815493, 'motor')], + -1.280662748184417), + ([(0.01465422, 'circuit'), + (0.0134508265, 'chip'), + (0.012013224, 'analog'), + (0.010762642, 'neuron'), + (0.008197728, 'signal'), + (0.007833759, 'voltage'), + (0.0075949323, 'memory'), + (0.0062134205, 'vlsi'), + (0.005665418, 'implementation'), + (0.00510467, 'bit'), + (0.004741555, 'noise'), + (0.004108878, 'processor'), + (0.004068751, 'pulse'), + (0.00402028, 'digital'), + (0.003979967, 'design'), + (0.0037854807, 'hardware'), + (0.0036803125, 'transistor'), + (0.0036066298, 'block'), + (0.0035669305, 'device'), + (0.0035628842, 'synapse')], + -1.2836262379148498), + ([(0.016415589, 'recognition'), + (0.0136875985, 'speech'), + (0.01258169, 'word'), + (0.0104766805, 'hidden'), + (0.0063662766, 'layer'), + (0.0061339615, 'character'), + (0.0056002084, 'trained'), + (0.005490037, 'context'), + (0.0051139165, 'sequence'), + (0.004984547, 'architecture'), + (0.004967922, 'hmm'), + (0.004862166, 'speaker'), + (0.004366162, 'net'), + (0.0042531807, 'digit'), + (0.0039046167, 'classification'), + (0.0037942464, 'class'), + (0.0037750585, 'frame'), + (0.00358875, 'mixture'), + (0.003476494, 'phoneme'), + (0.0034512014, 'letter')], + -1.323380921633785), + ([(0.008542947, 'rule'), + (0.00631226, 'hidden'), + (0.00597873, 'generalization'), + (0.0045754625, 'hidden_unit'), + (0.0043068537, 'prediction'), + (0.0040594153, 'net'), + (0.003990005, 'sequence'), + (0.0038032297, 'tree'), + (0.0035338537, 'machine'), + (0.0034035398, 'trained'), + (0.003242104, 'recurrent'), + (0.0031919426, 'training_set'), + (0.0029770972, 'table'), + (0.0028571628, 'learn'), + (0.0028489903, 'language'), + (0.0028364619, 'target'), + (0.0026097689, 'architecture'), + (0.0025739158, 'string'), + (0.0025172615, 'symbol'), + (0.0024356844, 'teacher')], + -1.3578438548773115)] @@ -959,9 +917,9 @@ References .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 2 minutes 47.007 seconds) + **Total running time of the script:** ( 4 minutes 13.971 seconds) -**Estimated memory usage:** 658 MB +**Estimated memory usage:** 664 MB .. 
_sphx_glr_download_auto_examples_tutorials_run_lda.py: diff --git a/docs/src/auto_examples/tutorials/sg_execution_times.rst b/docs/src/auto_examples/tutorials/sg_execution_times.rst index da986968c9..0dfaf2783f 100644 --- a/docs/src/auto_examples/tutorials/sg_execution_times.rst +++ b/docs/src/auto_examples/tutorials/sg_execution_times.rst @@ -5,18 +5,18 @@ Computation times ================= -**08:55.221** total execution time for **auto_examples_tutorials** files: +**04:13.971** total execution time for **auto_examples_tutorials** files: +-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 08:55.221 | 506.6 MB | +| :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` (``run_lda.py``) | 04:13.971 | 664.3 MB | +-------------------------------------------------------------------------------------+-----------+----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` (``run_doc2vec_lee.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` (``run_fasttext.py``) | 00:00.000 | 0.0 MB | +| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` (``run_lda.py``) | 00:00.000 | 0.0 MB | +| :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` (``run_fasttext.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_scm.py` (``run_scm.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+----------+ diff --git a/docs/src/check_gallery.py b/docs/src/check_gallery.py new file mode 100644 index 0000000000..d03726dabb --- /dev/null +++ b/docs/src/check_gallery.py @@ -0,0 +1,69 @@ +"""Check that the cached gallery files are up to date. + +If they are stale, then Sphinx will attempt to rebuild them from source. When +running the documentation build on CI, we want to avoid rebuilding the gallery, +because that takes too long. Instead, we use this script to warn the author of +the PR that they need to rebuild the docs themselves. +""" + +import hashlib +import os +import sys + + +def different(path1, path2): + with open(path1) as fin: + f1 = fin.read() + with open(path2) as fin: + f2 = fin.read() + return f1 != f2 + + +curr_dir = os.path.dirname(__file__) +stale = [] +for root, dirs, files in os.walk(os.path.join(curr_dir, 'gallery')): + for f in files: + if f.endswith('.py'): + source_path = os.path.join(root, f) + cache_path = source_path.replace('docs/src/gallery/', 'docs/src/auto_examples/') + + # + # We check two things: + # + # 1) Actual file content + # 2) MD5 checksums + # + # We check 1) because that's the part that matters to the user - + # it's what will appear in the documentation. We check 2) because + # that's what Sphinx Gallery relies on to decide what it needs to + # rebuild. 
In practice, only one of these checks is necessary, + but we run them both because it's trivial. + # + if different(source_path, cache_path): + stale.append(cache_path) + continue + + actual_md5 = hashlib.md5() + with open(source_path, 'rb') as fin: + actual_md5.update(fin.read()) + + md5_path = cache_path + '.md5' + with open(md5_path) as fin: + expected_md5 = fin.read() + + if actual_md5.hexdigest() != expected_md5: + stale.append(cache_path) + +if stale: + print(f"""The gallery cache appears stale. + +Rebuild the documentation using the following commands from the gensim root directory: + + pip install -e .[docs] + make -C docs/src html + +and then run `git add docs/src/auto_examples` to update the cache. + +Stale files: {stale} +""", file=sys.stderr) + sys.exit(1) diff --git a/docs/src/conf.py b/docs/src/conf.py index 007c219da7..168d4cf58e 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -61,9 +61,9 @@ # built documents. # # The short X.Y version. -version = '4.1' +version = '4.2.0' # The full version, including alpha/beta/rc tags. -release = '4.1.2' +release = '4.2.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/src/gallery/core/run_corpora_and_vector_spaces.py b/docs/src/gallery/core/run_corpora_and_vector_spaces.py index 983a9d1235..d02e7d3418 100644 --- a/docs/src/gallery/core/run_corpora_and_vector_spaces.py +++ b/docs/src/gallery/core/run_corpora_and_vector_spaces.py @@ -222,7 +222,7 @@ def __iter__(self): ############################################################################### # Other formats include `Joachim's SVMlight format `_, -# `Blei's LDA-C format `_ and +# `Blei's LDA-C format `_ and # `GibbsLDA++ format `_. corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus) diff --git a/docs/src/gallery/tutorials/run_lda.py b/docs/src/gallery/tutorials/run_lda.py index 2ec06a801c..7ee6b07cd2 100644 --- a/docs/src/gallery/tutorials/run_lda.py +++ b/docs/src/gallery/tutorials/run_lda.py @@ -245,7 +245,7 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' iterations = 400 eval_every = None # Don't evaluate model perplexity, takes too much time. -# Make a index to word dictionary. +# Make an index to word dictionary. temp = dictionary[0] # This is only to "load" the dictionary. id2word = dictionary.id2token @@ -278,7 +278,7 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # methods on the blog at http://rare-technologies.com/lda-training-tips/ ! # -top_topics = model.top_topics(corpus) #, num_words=20) +top_topics = model.top_topics(corpus) # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics diff --git a/docs/src/models/doc2vec.rst b/docs/src/models/doc2vec.rst index b5d2e290b5..99d4d27f01 100644 --- a/docs/src/models/doc2vec.rst +++ b/docs/src/models/doc2vec.rst @@ -7,3 +7,4 @@ :inherited-members: :undoc-members: :show-inheritance: + :special-members: __getitem__ diff --git a/docs/src/models/fasttext.rst b/docs/src/models/fasttext.rst index e65b43fd25..392e68f2fd 100644 --- a/docs/src/models/fasttext.rst +++ b/docs/src/models/fasttext.rst @@ -5,6 +5,6 @@ :synopsis: FastText model :members: :inherited-members: - :special-members: __getitem__ + :special-members: __getitem__, __contains__ :undoc-members: :show-inheritance: diff --git a/docs/src/models/keyedvectors.rst b/docs/src/models/keyedvectors.rst index db07e034e8..f51e03228d 100644 --- a/docs/src/models/keyedvectors.rst +++ b/docs/src/models/keyedvectors.rst @@ -7,3 +7,4 @@ :inherited-members: :undoc-members: :show-inheritance: + :special-members: __getitem__, __setitem__, __contains__ diff --git a/docs/src/models/ldamodel.rst b/docs/src/models/ldamodel.rst index 2dfb736ea6..d1bf9632fc 100644 --- a/docs/src/models/ldamodel.rst +++ b/docs/src/models/ldamodel.rst @@ -4,6 +4,7 @@ .. automodule:: gensim.models.ldamodel :synopsis: Latent Dirichlet Allocation :members: + :special-members: __getitem__ :inherited-members: :undoc-members: :show-inheritance: diff --git a/docs/src/models/lsimodel.rst b/docs/src/models/lsimodel.rst index fec09efbf4..278d39cf0b 100644 --- a/docs/src/models/lsimodel.rst +++ b/docs/src/models/lsimodel.rst @@ -4,6 +4,7 @@ .. automodule:: gensim.models.lsimodel :synopsis: Latent Semantic Indexing :members: + :special-members: __getitem__ :inherited-members: :undoc-members: :show-inheritance: diff --git a/docs/src/similarities/fastss.rst b/docs/src/similarities/fastss.rst new file mode 100644 index 0000000000..66dc0936a1 --- /dev/null +++ b/docs/src/similarities/fastss.rst @@ -0,0 +1,8 @@ +:mod:`similarities.fastss` -- Fast Levenshtein edit distance +================================================================== + +.. automodule:: gensim.similarities.fastss + :synopsis: Fast fuzzy search between strings, using the Levenshtein edit distance + :members: + :inherited-members: + diff --git a/docs/src/similarities/levenshtein.rst b/docs/src/similarities/levenshtein.rst new file mode 100644 index 0000000000..b5be710589 --- /dev/null +++ b/docs/src/similarities/levenshtein.rst @@ -0,0 +1,8 @@ +:mod:`similarities.levenshtein` -- Fast soft-cosine semantic similarity search +============================================================================== + +.. automodule:: gensim.similarities.levenshtein + :synopsis: Fast fuzzy search between strings, using the Soft-Cosine Semantic Similarity + :members: + :inherited-members: + diff --git a/gensim/__init__.py b/gensim/__init__.py index cf85b8bc4e..b5f915a3ed 100644 --- a/gensim/__init__.py +++ b/gensim/__init__.py @@ -4,7 +4,7 @@ """ -__version__ = '4.1.2' +__version__ = '4.2.0' import logging diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index d954061caf..51ec35038f 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -143,7 +143,9 @@ def __len__(self): def __str__(self): some_keys = list(itertools.islice(self.token2id.keys(), 5)) - return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '') + return "%s<%i unique tokens: %s%s>" % ( + self.__class__.__name__, len(self), some_keys, '...' 
if len(self) > 5 else '' + ) @staticmethod def from_documents(documents): @@ -328,7 +330,9 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N After the pruning, resulting gaps in word ids are shrunk. Due to this gap shrinking, **the same word may have a different word id before and after the call - to this function!** + to this function!** See :class:`gensim.models.VocabTransform` and the + `dedicated FAQ entry `_ on how # noqa + to transform a corpus built with a dictionary before pruning. Examples -------- diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index c2b8b620bf..b4406c248a 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -33,7 +33,6 @@ """ - from __future__ import with_statement import logging @@ -50,6 +49,8 @@ ) from gensim.utils import deaccent, simple_tokenize +from smart_open import open + logger = logging.getLogger(__name__) @@ -399,7 +400,7 @@ class TextDirectoryCorpus(TextCorpus): """ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_depth=None, - pattern=None, exclude_pattern=None, lines_are_documents=False, **kwargs): + pattern=None, exclude_pattern=None, lines_are_documents=False, encoding='utf-8', **kwargs): """ Parameters @@ -423,6 +424,8 @@ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_dept Regex to use for file name exclusion, all files matching this pattern will be ignored. lines_are_documents : bool, optional If True, each line is considered a document; otherwise, each file is one document. + encoding : str, optional + Encoding used to read the specified file or files in the specified directory. kwargs: keyword arguments passed through to the `TextCorpus` constructor. See :meth:`gensim.corpora.textcorpus.TextCorpus.__init__` docstring for more details on these. @@ -432,6 +435,7 @@ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_dept self.pattern = pattern self.exclude_pattern = exclude_pattern self.lines_are_documents = lines_are_documents + self.encoding = encoding super(TextDirectoryCorpus, self).__init__(input, dictionary, metadata, **kwargs) @property @@ -510,7 +514,7 @@ def getstream(self): """ num_texts = 0 for path in self.iter_filepaths(): - with open(path, 'rt') as f: + with open(path, 'rt', encoding=self.encoding) as f: if self.lines_are_documents: for line in f: yield line.strip() diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 5f4c173b8a..ee8c4ef281 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -452,8 +452,10 @@ def extract_pages(f, filter_namespaces=False, filter_articles=None): _extract_pages = extract_pages # for backward compatibility -def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, - token_max_len=TOKEN_MAX_LEN, lower=True): +def process_article( + args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, + token_max_len=TOKEN_MAX_LEN, lower=True, + ): """Parse a Wikipedia article, extract all tokens.
Notes @@ -525,7 +527,7 @@ def _process_article(args): return process_article( args, tokenizer_func=tokenizer_func, token_min_len=token_min_len, - token_max_len=token_max_len, lower=lower + token_max_len=token_max_len, lower=lower, ) @@ -567,9 +569,11 @@ class WikiCorpus(TextCorpus): >>> MmCorpus.serialize(corpus_path, wiki) # another 8h, creates a file in MatrixMarket format and mapping """ - def __init__(self, fname, processes=None, lemmatize=None, dictionary=None, - filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS, - token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None): + def __init__( + self, fname, processes=None, lemmatize=None, dictionary=None, metadata=False, + filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS, + token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None, + ): """Initialize the corpus. Unless a dictionary is provided, this scans the corpus once, @@ -602,6 +606,9 @@ def __init__(self, fname, processes=None, lemmatize=None, dictionary=None, If set, each XML article element will be passed to this callable before being processed. Only articles where the callable returns an XML element are processed, returning None allows filtering out some articles based on customised rules. + metadata: bool + Have the `get_texts()` method yield `(content_tokens, (page_id, page_title))` tuples, instead + of just `content_tokens`. Warnings -------- @@ -618,7 +625,7 @@ def __init__(self, fname, processes=None, lemmatize=None, dictionary=None, self.fname = fname self.filter_namespaces = filter_namespaces self.filter_articles = filter_articles - self.metadata = False + self.metadata = metadata if processes is None: processes = max(1, multiprocessing.cpu_count() - 1) self.processes = processes diff --git a/gensim/examples/dmlcz/dmlcorpus.py b/gensim/examples/dmlcz/dmlcorpus.py index d76c622c95..24aca6cb65 100644 --- a/gensim/examples/dmlcz/dmlcorpus.py +++ b/gensim/examples/dmlcz/dmlcorpus.py @@ -59,8 +59,9 @@ def addSource(self, source): self.sources[sourceId] = source def __str__(self): - return ("DmlConfig(id=%s, sources=[%s], acceptLangs=[%s])" % - (self.configId, ', '.join(self.sources.iterkeys()), ', '.join(self.acceptLangs))) + return "%s<id=%s, sources=[%s], acceptLangs=[%s]>" % ( + self.__class__.__name__, self.configId, ', '.join(self.sources.iterkeys()), ', '.join(self.acceptLangs) + ) # endclass DmlConfig diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 3358adaab5..c685602e57 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -29,19 +29,19 @@ class CorpusABC(utils.SaveLoad): .. sourcecode:: pycon - >>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class + >>> from gensim.corpora import MmCorpus # inherits from the CorpusABC class >>> from gensim.test.utils import datapath >>> >>> corpus = MmCorpus(datapath("testcorpus.mm")) >>> for doc in corpus: ... pass # do something with the doc... - A document represented in bag-of-word (BoW) format, i.e. list of (attr_id, attr_value), + A document represented in the bag-of-word (BoW) format, i.e. list of (attr_id, attr_value), like ``[(1, 0.2), (4, 0.6), ...]``. ..
sourcecode:: pycon - >>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class + >>> from gensim.corpora import MmCorpus # inherits from the CorpusABC class >>> from gensim.test.utils import datapath >>> >>> corpus = MmCorpus(datapath("testcorpus.mm")) @@ -49,28 +49,28 @@ class CorpusABC(utils.SaveLoad): >>> print(doc) [(0, 1.0), (1, 1.0), (2, 1.0)] - Remember, that save/load methods save only corpus class (not corpus as data itself), - for save/load functionality, please use this pattern : + Remember that the save/load methods only pickle the corpus object, not + the (streamed) corpus data itself! + To save the corpus data, please use this pattern: .. sourcecode:: pycon - >>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class + >>> from gensim.corpora import MmCorpus # MmCorpus inherits from CorpusABC >>> from gensim.test.utils import datapath, get_tmpfile >>> >>> corpus = MmCorpus(datapath("testcorpus.mm")) >>> tmp_path = get_tmpfile("temp_corpus.mm") >>> - >>> MmCorpus.serialize(tmp_path, corpus) # serialize corpus to disk in MmCorpus format - >>> # MmCorpus.save_corpus(tmp_path, corpus) # this variant also possible, but if serialize availbe - call it. + >>> MmCorpus.serialize(tmp_path, corpus) # serialize corpus to disk in the MmCorpus format >>> loaded_corpus = MmCorpus(tmp_path) # load corpus through constructor >>> for (doc_1, doc_2) in zip(corpus, loaded_corpus): - ... assert doc_1 == doc_2 # check that corpuses exactly same + ... assert doc_1 == doc_2 # no change between the original and loaded corpus See Also -------- :mod:`gensim.corpora` - Corpuses in different formats + Corpora in different formats. """ def __iter__(self): @@ -78,14 +78,14 @@ def __iter__(self): raise NotImplementedError('cannot instantiate abstract base class') def save(self, *args, **kwargs): - """Saves corpus in-memory state. + """Saves the in-memory state of the corpus (pickles the object). Warnings -------- - This save only the "state" of a corpus class, not the corpus data! + This saves only the "internal state" of the corpus object, not the corpus data! - For saving data use the `serialize` method of the output format you'd like to use - (e.g. :meth:`gensim.corpora.mmcorpus.MmCorpus.serialize`). + To save the corpus data, use the `serialize` method of your desired output format + instead, e.g. :meth:`gensim.corpora.mmcorpus.MmCorpus.serialize`. """ import warnings diff --git a/gensim/matutils.py b/gensim/matutils.py index fb2c54e680..4d4064acc0 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -597,23 +597,30 @@ def __iter__(self): def __len__(self): return self.sparse.shape[1] - def __getitem__(self, document_index): - """Retrieve a document vector from the corpus by its index. + def __getitem__(self, key): + """ + Retrieve a document vector or subset from the corpus by key. Parameters ---------- - document_index : int - Index of document + key: int, ellipsis, slice, iterable object + Index of the document to retrieve. + Less commonly, the key can also be a slice, ellipsis, or an iterable + to retrieve multiple documents. Returns ------- - list of (int, number) - Document in BoW format. - + list of (int, number), Sparse2Corpus + Document in BoW format when `key` is an integer. Otherwise :class:`~gensim.matutils.Sparse2Corpus`.
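A short usage sketch of the extended indexing behaviour; the random CSC matrix here is hypothetical illustration data (documents are stored as columns):

    import scipy.sparse
    from gensim.matutils import Sparse2Corpus

    corpus = Sparse2Corpus(scipy.sparse.random(5, 10, density=0.3, format='csc'))
    doc = corpus[0]       # an int key returns one document in BoW format: [(id, value), ...]
    subset = corpus[2:6]  # a slice returns another Sparse2Corpus over those documents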
""" - indprev = self.sparse.indptr[document_index] - indnow = self.sparse.indptr[document_index + 1] - return list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow])) + sparse = self.sparse + if isinstance(key, int): + iprev = self.sparse.indptr[key] + inow = self.sparse.indptr[key + 1] + return list(zip(sparse.indices[iprev:inow], sparse.data[iprev:inow])) + + sparse = self.sparse.__getitem__((slice(None, None, None), key)) + return Sparse2Corpus(sparse) def veclen(vec): diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 838c7634e3..75893c5ac0 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -331,8 +331,8 @@ def __str__(self): String representation of current instance. """ - return "AuthorTopicModel(num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s)" % \ - (self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize) + return "%s" % \ + (self.__class__.__name__, self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize) def init_empty_corpus(self): """Initialize an empty corpus. diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index c4b28316b7..20a739f64a 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -130,7 +130,7 @@ def __str__(self): Human readable representation of the object's state (words and tags). """ - return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags) + return '%s<%s, %s>' % (self.__class__.__name__, self.words, self.tags) @dataclass @@ -156,9 +156,11 @@ def count(self, new_val): class Doc2Vec(Word2Vec): - def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, - dm_tag_count=1, dv=None, dv_mapfile=None, comment=None, trim_rule=None, callbacks=(), - window=5, epochs=10, shrink_windows=True, **kwargs): + def __init__( + self, documents=None, corpus_file=None, vector_size=100, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, + dm_tag_count=1, dv=None, dv_mapfile=None, comment=None, trim_rule=None, callbacks=(), + window=5, epochs=10, shrink_windows=True, **kwargs, + ): """Class for training, using and evaluating neural networks described in `Distributed Representations of Sentences and Documents `_. @@ -219,7 +221,7 @@ def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=No More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that other values may perform better for recommendation applications. dm_mean : {1,0}, optional - If 0 , use the sum of the context word vectors. If 1, use the mean. + If 0, use the sum of the context word vectors. If 1, use the mean. Only applies when `dm` is used in non-concatenative mode. dm_concat : {1,0}, optional If 1, use concatenation of context vectors rather than sum/average; @@ -494,7 +496,7 @@ def train( """ if corpus_file is None and corpus_iterable is None: - raise TypeError("Either one of corpus_file or documents value must be provided") + raise TypeError("Either one of corpus_file or corpus_iterable value must be provided") if corpus_file is not None and corpus_iterable is not None: raise TypeError("Both corpus_file and corpus_iterable must not be provided at the same time") @@ -655,7 +657,7 @@ def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None): return doctag_vectors[0] def __getitem__(self, tag): - """Get the vector representation of (possible multi-term) tag. 
+ """Get the vector representation of (possibly multi-term) tag. Parameters ---------- @@ -713,7 +715,7 @@ def __str__(self): segments.append('s%g' % self.sample) if self.workers > 1: segments.append('t%d' % self.workers) - return '%s(%s)' % (self.__class__.__name__, ','.join(segments)) + return '%s<%s>' % (self.__class__.__name__, ','.join(segments)) def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False): """Store the input-hidden weight matrix in the same format used by the original C word2vec-tool. @@ -836,8 +838,10 @@ def estimate_memory(self, vocab_size=None, report=None): report['doctag_syn0'] = len(self.dv) * self.vector_size * dtype(REAL).itemsize return super(Doc2Vec, self).estimate_memory(vocab_size, report=report) - def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, - keep_raw_vocab=False, trim_rule=None, **kwargs): + def build_vocab( + self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, + keep_raw_vocab=False, trim_rule=None, **kwargs, + ): """Build vocabulary from a sequence of documents (can be a once-only generator stream). Parameters @@ -877,7 +881,7 @@ def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, prog """ total_words, corpus_count = self.scan_vocab( corpus_iterable=corpus_iterable, corpus_file=corpus_file, - progress_per=progress_per, trim_rule=trim_rule + progress_per=progress_per, trim_rule=trim_rule, ) self.corpus_count = corpus_count self.corpus_total_words = total_words @@ -959,7 +963,7 @@ def _scan_vocab(self, corpus_iterable, progress_per, trim_rule): if document_no % progress_per == 0: interval_rate = (total_words - interval_count) / (default_timer() - interval_start) logger.info( - "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags", + "PROGRESS: at example #%i, processed %i words (%i words/s), %i word types, %i tags", document_no, total_words, interval_rate, len(vocab), len(doctags_list) ) interval_start = default_timer() @@ -993,7 +997,9 @@ def _scan_vocab(self, corpus_iterable, progress_per, trim_rule): logger.warning( "Highest int doctag (%i) larger than count of documents (%i). This means " "at least %i excess, unused slots (%i bytes) will be allocated for vectors.", - max_rawint, corpus_count, ((max_rawint - corpus_count) * self.vector_size * 4)) + max_rawint, corpus_count, max_rawint - corpus_count, + (max_rawint - corpus_count) * self.vector_size * dtype(REAL).itemsize, + ) if max_rawint > -1: # adjust indexes/list to account for range of pure-int keyed doctags for key in doctags_list: @@ -1008,8 +1014,8 @@ def _scan_vocab(self, corpus_iterable, progress_per, trim_rule): self.raw_vocab = vocab return total_words, corpus_count - def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, trim_rule=None): - """Create the models Vocabulary: A mapping from unique words in the corpus to their frequency count. + def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=100000, trim_rule=None): + """Create the model's vocabulary: a mapping from unique words in the corpus to their frequency count. Parameters ---------- @@ -1038,7 +1044,7 @@ def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, Returns ------- (int, int) - Tuple of (Total words in the corpus, number of documents) + Tuple of `(total words in the corpus, number of documents)`. 
""" logger.info("collecting all words and their counts") @@ -1049,7 +1055,7 @@ def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, logger.info( "collected %i word types and %i unique tags from a corpus of %i examples and %i words", - len(self.raw_vocab), len(self.dv), corpus_count, total_words + len(self.raw_vocab), len(self.dv), corpus_count, total_words, ) return total_words, corpus_count diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index a94bc17f27..7c0ec8501b 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -38,8 +38,8 @@ >>> print(len(common_texts)) 9 >>> model = FastText(vector_size=4, window=3, min_count=1) # instantiate - >>> model.build_vocab(sentences=common_texts) - >>> model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10) # train + >>> model.build_vocab(corpus_iterable=common_texts) + >>> model.train(corpus_iterable=common_texts, total_examples=len(common_texts), epochs=10) # train Once you have a model, you can access its keyed vectors via the `model.wv` attributes. The keyed vectors instance is quite powerful: it can perform a wide range of NLP tasks. @@ -108,9 +108,9 @@ >>> >>> >>> model4 = FastText(vector_size=4, window=3, min_count=1) - >>> model4.build_vocab(sentences=MyIter()) + >>> model4.build_vocab(corpus_iterable=MyIter()) >>> total_examples = model4.corpus_count - >>> model4.train(sentences=MyIter(), total_examples=total_examples, epochs=5) + >>> model4.train(corpus_iterable=MyIter(), total_examples=total_examples, epochs=5) Persist a model to disk with: @@ -968,11 +968,6 @@ def __init__(self, vector_size, min_n, max_n, bucket, count=0, dtype=REAL): buckets_word : list of np.array For each key (by its index), report bucket slots their subwords map to. - When used in training, FastTextKeyedVectors may be decorated with - extra attributes that closely associate with its core attributes, - such as the experimental vectors_vocab_lockf and vectors_ngrams_lockf - training-update-dampening factors. - """ super(FastTextKeyedVectors, self).__init__(vector_size=vector_size, count=count, dtype=dtype) self.min_n = min_n @@ -1045,7 +1040,7 @@ def __contains__(self, word): Note ---- - This method **always** returns True, because of the way FastText works. + This method **always** returns True with char ngrams, because of the way FastText works. If you want to check if a word is an in-vocabulary term, use this instead: @@ -1059,7 +1054,10 @@ def __contains__(self, word): False """ - return True + if self.bucket == 0: # check for the case when char ngrams not used + return word in self.key_to_index + else: + return True def save(self, *args, **kwargs): """Save object. @@ -1131,6 +1129,23 @@ def get_vector(self, word, norm=False): else: return word_vec / len(ngram_hashes) + def get_sentence_vector(self, sentence): + """Get a single 1-D vector representation for a given `sentence`. + This function is workalike of the official fasttext's get_sentence_vector(). + + Parameters + ---------- + sentence : list of (str or int) + list of words specified by string or int ids. + + Returns + ------- + numpy.ndarray + 1-D numpy array representation of the `sentence`. 
+ + """ + return super(FastTextKeyedVectors, self).get_mean_vector(sentence) + def resize_vectors(self, seed=0): """Make underlying vectors match 'index_to_key' size; random-initialize any new rows.""" diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index b5debb21c1..8f86b807f2 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -174,8 +174,8 @@ from typing import Iterable from numpy import ( - dot, float32 as REAL, double, array, zeros, vstack, - ndarray, sum as np_sum, prod, argmax, dtype, ascontiguousarray, frombuffer, + dot, float32 as REAL, double, zeros, vstack, ndarray, + sum as np_sum, prod, argmax, dtype, ascontiguousarray, frombuffer, ) import numpy as np from scipy import stats @@ -203,6 +203,9 @@ def _ensure_list(value): if isinstance(value, _KEY_TYPES) or (isinstance(value, ndarray) and len(value.shape) == 1): return [value] + if isinstance(value, ndarray) and len(value.shape) == 2: + return list(value) + return value @@ -253,6 +256,9 @@ def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None): self.mapfile_path = mapfile_path + def __str__(self): + return f"{self.__class__.__name__}" + def _load_specials(self, *args, **kwargs): """Handle special requirements of `.load()` protocol, usually up-converting older versions.""" super(KeyedVectors, self)._load_specials(*args, **kwargs) @@ -274,6 +280,9 @@ def _load_specials(self, *args, **kwargs): # fixup rename of vocab into map if 'key_to_index' not in self.__dict__: self._upconvert_old_vocab() + # ensure older instances have next_index + if not hasattr(self, 'next_index'): + self.next_index = len(self) def _upconvert_old_vocab(self): """Convert a loaded, pre-gensim-4.0.0 version instance that had a 'vocab' dict of data objects.""" @@ -450,6 +459,71 @@ def word_vec(self, *args, **kwargs): """Compatibility alias for get_vector(); must exist so subclass calls reach subclass get_vector().""" return self.get_vector(*args, **kwargs) + def get_mean_vector(self, keys, weights=None, pre_normalize=True, post_normalize=False, ignore_missing=True): + """Get the mean vector for a given list of keys. + + Parameters + ---------- + + keys : list of (str or int or ndarray) + Keys specified by string or int ids or numpy array. + weights : list of float or numpy.ndarray, optional + 1D array of same size of `keys` specifying the weight for each key. + pre_normalize : bool, optional + Flag indicating whether to normalize each keyvector before taking mean. + If False, individual keyvector will not be normalized. + post_normalize: bool, optional + Flag indicating whether to normalize the final mean vector. + If True, normalized mean vector will be return. + ignore_missing : bool, optional + If False, will raise error if a key doesn't exist in vocabulary. + + Returns + ------- + + numpy.ndarray + Mean vector for the list of keys. + + Raises + ------ + + ValueError + If the size of the list of `keys` and `weights` doesn't match. + KeyError + If any of the key doesn't exist in vocabulary and `ignore_missing` is false. 
+ + """ + if len(keys) == 0: + raise ValueError("cannot compute mean with no input") + if isinstance(weights, list): + weights = np.array(weights) + if weights is None: + weights = np.ones(len(keys)) + if len(keys) != weights.shape[0]: # weights is a 1-D numpy array + raise ValueError( + "keys and weights array must have same number of elements" + ) + + mean = np.zeros(self.vector_size, self.vectors.dtype) + + total_weight = 0 + for idx, key in enumerate(keys): + if isinstance(key, ndarray): + mean += weights[idx] * key + total_weight += abs(weights[idx]) + elif self.__contains__(key): + vec = self.get_vector(key, norm=pre_normalize) + mean += weights[idx] * vec + total_weight += abs(weights[idx]) + elif not ignore_missing: + raise KeyError(f"Key '{key}' not present in vocabulary") + + if(total_weight > 0): + mean = mean / total_weight + if post_normalize: + mean = matutils.unitvec(mean).astype(REAL) + return mean + def add_vector(self, key, vector): """Add one new vector at the given key, into existing slot if available. @@ -714,10 +788,10 @@ def most_similar( Parameters ---------- - positive : list of (str or int or ndarray), optional - List of keys that contribute positively. - negative : list of (str or int or ndarray), optional - List of keys that contribute negatively. + positive : list of (str or int or ndarray) or list of ((str,float) or (int,float) or (ndarray,float)), optional + List of keys that contribute positively. If tuple, second element specifies the weight (default `1.0`) + negative : list of (str or int or ndarray) or list of ((str,float) or (int,float) or (ndarray,float)), optional + List of keys that contribute negatively. If tuple, second element specifies the weight (default `-1.0`) topn : int or None, optional Number of top-N similar keys to return, when `topn` is int. When `topn` is None, then similarities for all keys are returned. @@ -755,27 +829,20 @@ def most_similar( clip_end = restrict_vocab # add weights for each key, if not already present; default to 1.0 for positive and -1.0 for negative keys - positive = [ - (item, 1.0) if isinstance(item, _EXTENDED_KEY_TYPES) else item - for item in positive - ] - negative = [ - (item, -1.0) if isinstance(item, _EXTENDED_KEY_TYPES) else item - for item in negative - ] + keys = [] + weight = np.concatenate((np.ones(len(positive)), -1.0 * np.ones(len(negative)))) + for idx, item in enumerate(positive + negative): + if isinstance(item, _EXTENDED_KEY_TYPES): + keys.append(item) + else: + keys.append(item[0]) + weight[idx] = item[1] # compute the weighted average of all keys - all_keys, mean = set(), [] - for key, weight in positive + negative: - if isinstance(key, ndarray): - mean.append(weight * key) - else: - mean.append(weight * self.get_vector(key, norm=True)) - if self.has_index_for(key): - all_keys.add(self.get_index(key)) - if not mean: - raise ValueError("cannot compute similarity with no input") - mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) + mean = self.get_mean_vector(keys, weight, pre_normalize=True, post_normalize=True, ignore_missing=False) + all_keys = [ + self.get_index(key) for key in keys if isinstance(key, _KEY_TYPES) and self.has_index_for(key) + ] if indexer is not None and isinstance(topn, int): return indexer.most_similar(mean, topn) @@ -943,7 +1010,9 @@ def nbow(document): # Compute WMD. 
return emd(d1, d2, distance_matrix) - def most_similar_cosmul(self, positive=None, negative=None, topn=10): + def most_similar_cosmul( + self, positive=None, negative=None, topn=10, restrict_vocab=None + ): """Find the top-N most similar words, using the multiplicative combination objective, proposed by `Omer Levy and Yoav Goldberg "Linguistic Regularities in Sparse and Explicit Word Representations" `_. Positive words still contribute positively towards the similarity, @@ -956,6 +1025,9 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): With a single positive example, rankings will be the same as in the default :meth:`~gensim.models.keyedvectors.KeyedVectors.most_similar`. + Allows calls like most_similar_cosmul('dog', 'cat'), as a shorthand for + most_similar_cosmul(['dog'], ['cat']) where 'dog' is positive and 'cat' is negative. + Parameters ---------- positive : list of str, optional @@ -965,6 +1037,11 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): topn : int or None, optional Number of top-N similar words to return, when `topn` is int. When `topn` is None, then similarities for all words are returned. + restrict_vocab : int or None, optional + Optional integer which limits the range of vectors which are searched for most-similar values. + For example, restrict_vocab=10000 would only check the first 10000 word vectors in the vocabulary order. + This may be meaningful if the vocabulary is sorted by descending frequency. + + Returns ------- @@ -982,7 +1059,14 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): positive = _ensure_list(positive) negative = _ensure_list(negative) - self.fill_norms() + self.init_sims() + + if isinstance(positive, str): + # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog']) + positive = [positive] + + if isinstance(negative, str): + negative = [negative] all_words = { self.get_index(word) for word in positive + negative @@ -1039,7 +1123,7 @@ def rank_by_centrality(self, words, use_norm=True): if not used_words: raise ValueError("cannot select a word from an empty list") vectors = vstack([self.get_vector(word, norm=use_norm) for word in used_words]).astype(REAL) - mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) + mean = self.get_mean_vector(vectors, post_normalize=True) dists = dot(vectors, mean) return sorted(zip(dists, used_words), reverse=True) @@ -1171,9 +1255,9 @@ def n_similarity(self, ws1, ws2): """ if not(len(ws1) and len(ws2)): raise ZeroDivisionError('At least one of the passed list is empty.') - v1 = [self[key] for key in ws1] - v2 = [self[key] for key in ws2] - return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) + mean1 = self.get_mean_vector(ws1, pre_normalize=False) + mean2 = self.get_mean_vector(ws2, pre_normalize=False) + return dot(matutils.unitvec(mean1), matutils.unitvec(mean2)) @staticmethod def _log_evaluate_word_analogies(section): @@ -1202,7 +1286,9 @@ def _log_evaluate_word_analogies(section): logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect) return score - def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): + def evaluate_word_analogies( + self, analogies, restrict_vocab=300000, case_insensitive=True, + dummy4unknown=False, similarity_function='most_similar'): """Compute performance of the model on an analogy test set.
The accuracy is reported (printed to log and returned as a score) for each section separately, @@ -1228,6 +1314,8 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi dummy4unknown : bool, optional If True - produce zero accuracies for 4-tuples with out-of-vocabulary words. Otherwise, these tuples are skipped entirely and not used in the evaluation. + similarity_function : str, optional + Function name used for similarity calculation. Returns ------- @@ -1283,6 +1371,7 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi predicted = None # find the most likely prediction using 3CosAdd (vector offset) method # TODO: implement 3CosMul and set-based methods for solving analogies + sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab) self.key_to_index = original_key_to_index for element in sims: @@ -1334,7 +1423,8 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs): logger.info('Pairs with unknown words ratio: %.1f%%', oov) def evaluate_word_pairs( - self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False, + self, pairs, delimiter='\t', encoding='utf8', + restrict_vocab=300000, case_insensitive=True, dummy4unknown=False, ): """Compute correlation of the model with human similarity judgments. @@ -1385,16 +1475,12 @@ def evaluate_word_pairs( similarity_model = [] oov = 0 - original_key_to_index = self.key_to_index - self.key_to_index = ok_vocab - - with utils.open(pairs, 'rb') as fin: - for line_no, line in enumerate(fin): - line = utils.to_unicode(line) - if line.startswith('#'): - # May be a comment - continue - else: + original_key_to_index, self.key_to_index = self.key_to_index, ok_vocab + try: + with utils.open(pairs, encoding=encoding) as fin: + for line_no, line in enumerate(fin): + if not line or line.startswith('#'): # Ignore lines with comments. 
+ continue try: if case_insensitive: a, b, sim = [word.upper() for word in line.split(delimiter)] @@ -1404,19 +1490,27 @@ def evaluate_word_pairs( except (ValueError, TypeError): logger.info('Skipping invalid line #%d in %s', line_no, pairs) continue + if a not in ok_vocab or b not in ok_vocab: oov += 1 if dummy4unknown: logger.debug('Zero similarity for line #%d with OOV words: %s', line_no, line.strip()) similarity_model.append(0.0) similarity_gold.append(sim) - continue else: - logger.debug('Skipping line #%d with OOV words: %s', line_no, line.strip()) - continue + logger.info('Skipping line #%d with OOV words: %s', line_no, line.strip()) + continue similarity_gold.append(sim) # Similarity from the dataset similarity_model.append(self.similarity(a, b)) # Similarity from the model - self.key_to_index = original_key_to_index + finally: + self.key_to_index = original_key_to_index + + assert len(similarity_gold) == len(similarity_model) + if not similarity_gold: + raise ValueError( + f"No valid similarity judgements found in {pairs}: either invalid format or " + f"all are out-of-vocabulary in {self}" + ) spearman = stats.spearmanr(similarity_gold, similarity_model) pearson = stats.pearsonr(similarity_gold, similarity_model) if dummy4unknown: @@ -1804,7 +1898,7 @@ def __lt__(self, other): # used for sorting in a priority queue def __str__(self): vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')] - return "%s(%s)" % (self.__class__.__name__, ', '.join(vals)) + return "%s<%s>" % (self.__class__.__name__, ', '.join(vals)) # compatibility alias, allowing older pickle-based `.save()`s to load @@ -1833,7 +1927,7 @@ def _add_word_to_kv(kv, counts, word, weights, vocab_size): kv.set_vecattr(word, 'count', word_count) -def _add_bytes_to_kv(kv, counts, chunk, vocab_size, vector_size, datatype, unicode_errors): +def _add_bytes_to_kv(kv, counts, chunk, vocab_size, vector_size, datatype, unicode_errors, encoding): start = 0 processed_words = 0 bytes_per_vector = vector_size * dtype(REAL).itemsize @@ -1846,7 +1940,7 @@ def _add_bytes_to_kv(kv, counts, chunk, vocab_size, vector_size, datatype, unico if i_space == -1 or (len(chunk) - i_vector) < bytes_per_vector: break - word = chunk[start:i_space].decode("utf-8", errors=unicode_errors) + word = chunk[start:i_space].decode(encoding, errors=unicode_errors) # Some binary files are reported to have obsolete new line in the beginning of word, remove it word = word.lstrip('\n') vector = frombuffer(chunk, offset=i_vector, count=vector_size, dtype=REAL).astype(datatype) @@ -1857,7 +1951,10 @@ def _add_bytes_to_kv(kv, counts, chunk, vocab_size, vector_size, datatype, unico return processed_words, chunk[start:] -def _word2vec_read_binary(fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size): +def _word2vec_read_binary( + fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size, + encoding="utf-8", + ): chunk = b'' tot_processed_words = 0 @@ -1865,7 +1962,7 @@ def _word2vec_read_binary(fin, kv, counts, vocab_size, vector_size, datatype, un new_chunk = fin.read(binary_chunk_size) chunk += new_chunk processed_words, chunk = _add_bytes_to_kv( - kv, counts, chunk, vocab_size, vector_size, datatype, unicode_errors) + kv, counts, chunk, vocab_size, vector_size, datatype, unicode_errors, encoding) tot_processed_words += processed_words if len(new_chunk) < binary_chunk_size: break @@ -1970,7 +2067,7 @@ def _load_word2vec_format( if binary: 
_word2vec_read_binary( - fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size, + fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size, encoding ) else: _word2vec_read_text(fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, encoding) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 6691ddcc31..10a0c60134 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -615,8 +615,8 @@ def __str__(self): Human readable representation of the most important model parameters. """ - return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % ( - self.num_terms, self.num_topics, self.decay, self.chunksize + return "%s<num_terms=%s, num_topics=%s, decay=%s, chunksize=%s>" % ( + self.__class__.__name__, self.num_terms, self.num_topics, self.decay, self.chunksize ) def sync_state(self, current_Elogbeta=None): diff --git a/gensim/models/logentropy_model.py b/gensim/models/logentropy_model.py index a79c685660..16fbace8d2 100644 --- a/gensim/models/logentropy_model.py +++ b/gensim/models/logentropy_model.py @@ -76,7 +76,7 @@ def __init__(self, corpus, normalize=True): self.initialize(corpus) def __str__(self): - return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs, self.n_words) + return "%s<n_docs=%s, n_words=%s>" % (self.__class__.__name__, self.n_docs, self.n_words) def initialize(self, corpus): """Calculates the global weighting for all terms in a given corpus and transforms the simple diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 06055722e1..0bdb9f9bf9 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -23,7 +23,7 @@ * distributed computing for very large corpora, making use of a cluster of machines -Wall-clock `performance on the English Wikipedia `_ +Wall-clock `performance on the English Wikipedia `_ (2G corpus positions, 3.2M documents, 100K features, 0.5G non-zero entries in the final TF-IDF matrix), requesting the top 400 LSI factors: @@ -70,6 +70,7 @@ from gensim import interfaces, matutils, utils from gensim.models import basemodel +from gensim.utils import is_empty logger = logging.getLogger(__name__) @@ -162,8 +163,11 @@ class Projection(utils.SaveLoad): via :meth:`~gensim.models.lsimodel.Projection.merge`. This is how incremental training actually happens. """ - def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, - extra_dims=P2_EXTRA_DIMS, dtype=np.float64): + + def __init__( + self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, + extra_dims=P2_EXTRA_DIMS, dtype=np.float64, random_seed=None, + ): """Construct the (U, S) projection from a corpus. Parameters @@ -183,11 +187,15 @@ def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITER Extra samples to be used besides the rank `k`. Tune to improve accuracy. dtype : numpy.dtype, optional Enforces a type for elements of the decomposed matrix. + random_seed: {None, int}, optional + Random seed used to initialize a local instance of the + numpy.random.RandomState pseudo-random number generator. """ self.m, self.k = m, k self.power_iters = power_iters self.extra_dims = extra_dims + self.random_seed = random_seed if docs is not None: # base case decomposition: given a job `docs`, compute its decomposition, # *in-core*.
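With the new `random_seed` parameter, repeated runs of the randomized SVD become reproducible. A minimal sketch, assuming `corpus` is a BoW corpus and `id2word` its dictionary:

    from gensim.models import LsiModel

    lsi_a = LsiModel(corpus, id2word=id2word, num_topics=200, random_seed=42)
    lsi_b = LsiModel(corpus, id2word=id2word, num_topics=200, random_seed=42)
    # both runs now draw the same gaussian sampling matrix, so the projections match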
@@ -195,7 +203,7 @@ def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITER u, s = stochastic_svd( docs, k, chunksize=sys.maxsize, num_terms=m, power_iters=self.power_iters, - extra_dims=self.extra_dims, dtype=dtype) + extra_dims=self.extra_dims, dtype=dtype, random_seed=self.random_seed) else: try: import sparsesvd @@ -223,7 +231,10 @@ def empty_like(self): An empty copy (without corpus) of the current projection. """ - return Projection(self.m, self.k, power_iters=self.power_iters, extra_dims=self.extra_dims) + return Projection( + self.m, self.k, power_iters=self.power_iters, + extra_dims=self.extra_dims, random_seed=self.random_seed, + ) def merge(self, other, decay=1.0): """Merge current :class:`~gensim.models.lsimodel.Projection` instance with another. @@ -354,9 +365,9 @@ class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel): def __init__( self, corpus=None, num_topics=200, id2word=None, chunksize=20000, - decay=1.0, distributed=False, onepass=True, - power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64 - ): + decay=1.0, distributed=False, onepass=True, power_iters=P2_EXTRA_ITERS, + extra_samples=P2_EXTRA_DIMS, dtype=np.float64, random_seed=None, + ): """Build an LSI model. Parameters @@ -383,6 +394,9 @@ def __init__( Extra samples to be used besides the rank `k`. Can improve accuracy. dtype : type, optional Enforces a type for elements of the decomposed matrix. + random_seed: {None, int}, optional + Random seed used to initialize a local instance of the + numpy.random.RandomState pseudo-random number generator. """ self.id2word = id2word @@ -396,6 +410,7 @@ def __init__( self.onepass = onepass self.extra_samples, self.power_iters = extra_samples, power_iters self.dtype = dtype + self.random_seed = random_seed if corpus is None and self.id2word is None: raise ValueError( @@ -411,7 +426,8 @@ def __init__( self.docs_processed = 0 self.projection = Projection( - self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples, dtype=dtype + self.num_terms, self.num_topics, power_iters=self.power_iters, + extra_dims=self.extra_samples, dtype=dtype, random_seed=self.random_seed ) self.numworkers = 1 @@ -474,15 +490,20 @@ def add_documents(self, corpus, chunksize=None, decay=None): chunksize = self.chunksize if decay is None: decay = self.decay - + if is_empty(corpus): + logger.warning('LsiModel.add_documents() called but no documents provided, is this intended?') if not scipy.sparse.issparse(corpus): if not self.onepass: # we are allowed multiple passes over the input => use a faster, randomized two-pass algo - update = Projection(self.num_terms, self.num_topics, None, dtype=self.dtype) + update = Projection( + self.num_terms, self.num_topics, None, + dtype=self.dtype, random_seed=self.random_seed, + ) update.u, update.s = stochastic_svd( corpus, self.num_topics, num_terms=self.num_terms, chunksize=chunksize, - extra_dims=self.extra_samples, power_iters=self.power_iters, dtype=self.dtype + extra_dims=self.extra_samples, power_iters=self.power_iters, dtype=self.dtype, + random_seed=self.random_seed, ) self.projection.merge(update, decay=decay) self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else 0 @@ -499,7 +520,9 @@ def add_documents(self, corpus, chunksize=None, decay=None): # definitely avoid materializing it as a dense matrix!
logger.debug("converting corpus to csc format") job = matutils.corpus2csc( - chunk, num_docs=len(chunk), num_terms=self.num_terms, num_nnz=nnz, dtype=self.dtype) + chunk, num_docs=len(chunk), num_terms=self.num_terms, + num_nnz=nnz, dtype=self.dtype, + ) del chunk doc_no += job.shape[1] if self.dispatcher: @@ -513,7 +536,7 @@ def add_documents(self, corpus, chunksize=None, decay=None): # serial version, there is only one "worker" (myself) => process the job directly update = Projection( self.num_terms, self.num_topics, job, extra_dims=self.extra_samples, - power_iters=self.power_iters, dtype=self.dtype + power_iters=self.power_iters, dtype=self.dtype, random_seed=self.random_seed, ) del job self.projection.merge(update, decay=decay) @@ -530,7 +553,7 @@ def add_documents(self, corpus, chunksize=None, decay=None): assert not self.dispatcher, "must be in serial mode to receive jobs" update = Projection( self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, - power_iters=self.power_iters, dtype=self.dtype + power_iters=self.power_iters, dtype=self.dtype, ) self.projection.merge(update, decay=decay) logger.info("processed sparse job of %i documents", corpus.shape[1]) @@ -545,8 +568,8 @@ def __str__(self): A human readable string of the current objects parameters. """ - return "LsiModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % ( - self.num_terms, self.num_topics, self.decay, self.chunksize + return "%s" % ( + self.__class__.__name__, self.num_terms, self.num_topics, self.decay, self.chunksize ) def __getitem__(self, bow, scaled=False, chunksize=512): @@ -569,7 +592,8 @@ def __getitem__(self, bow, scaled=False, chunksize=512): Latent representation of corpus in BoW format if `bow` is corpus. """ - assert self.projection.u is not None, "decomposition not initialized yet" + if self.projection.u is None: + raise ValueError('No training data provided - LSI model not initialized yet') # if the input vector is in fact a corpus, return a transformed corpus as a result is_corpus, bow = utils.is_corpus(bow) @@ -731,7 +755,7 @@ def print_debug(self, num_topics=5, num_words=10): print_debug( self.id2word, self.projection.u, self.projection.s, range(min(num_topics, len(self.projection.u.T))), - num_words=num_words + num_words=num_words, ) def save(self, fname, *args, **kwargs): @@ -864,8 +888,10 @@ def print_debug(id2token, u, s, topics, num_words=10, num_neg=None): logger.info('topic #%s(%.3f): %s, ..., %s', topic, s[topic], ', '.join(pos), ', '.join(neg)) -def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, - power_iters=0, dtype=np.float64, eps=1e-6): +def stochastic_svd( + corpus, rank, num_terms, chunksize=20000, extra_dims=None, + power_iters=0, dtype=np.float64, eps=1e-6, random_seed=None, + ): """Run truncated Singular Value Decomposition (SVD) on a sparse input. Parameters @@ -888,6 +914,10 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, Enforces a type for elements of the decomposed matrix. eps: float, optional Percentage of the spectrum's energy to be discarded. + random_seed: {None, int}, optional + Random seed used to initialize the pseudo-random number generator, + a local instance of numpy.random.RandomState instance. 
+ Notes ----- @@ -924,13 +954,16 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, # and more memory friendly than processing all documents at once) y = np.zeros(dtype=dtype, shape=(num_terms, samples)) logger.info("1st phase: constructing %s action matrix", str(y.shape)) + random_state = np.random.RandomState(random_seed) if scipy.sparse.issparse(corpus): m, n = corpus.shape - assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms) - o = np.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype) # draw a random gaussian matrix - sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices, - corpus.data, o.ravel(), y.ravel()) # y = corpus * o + assert num_terms == m, f"mismatch in number of features: {m} in sparse matrix vs. {num_terms} parameter" + o = random_state.normal(0.0, 1.0, (n, samples)).astype(y.dtype) # draw a random gaussian matrix + sparsetools.csc_matvecs( + m, n, samples, corpus.indptr, corpus.indices, + corpus.data, o.ravel(), y.ravel(), + ) # y = corpus * o del o # unlike np, scipy.sparse `astype()` copies everything, even if there is no change to dtype! @@ -960,10 +993,10 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, assert n <= chunksize # the very last chunk of A is allowed to be smaller in size num_docs += n logger.debug("multiplying chunk * gauss") - o = np.random.normal(0.0, 1.0, (n, samples)).astype(dtype) # draw a random gaussian matrix + o = random_state.normal(0.0, 1.0, (n, samples)).astype(dtype) # draw a random gaussian matrix sparsetools.csc_matvecs( m, n, samples, chunk.indptr, chunk.indices, # y = y + chunk * o - chunk.data, o.ravel(), y.ravel() + chunk.data, o.ravel(), y.ravel(), ) del chunk, o y = [y] diff --git a/gensim/models/normmodel.py b/gensim/models/normmodel.py index 3292f6514e..62cbfc8fef 100644 --- a/gensim/models/normmodel.py +++ b/gensim/models/normmodel.py @@ -41,7 +41,9 @@ def __init__(self, corpus=None, norm='l2'): pass def __str__(self): - return "NormModel(num_docs=%s, num_nnz=%s, norm=%s)" % (self.num_docs, self.num_nnz, self.norm) + return "%s<num_docs=%s, num_nnz=%s, norm=%s>" % ( + self.__class__.__name__, self.num_docs, self.num_nnz, self.norm + ) def calc_norm(self, corpus): """Calculate the norm by calling :func:`~gensim.matutils.unitvec` with the norm parameter. diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index 1b2bf9fbb2..cbdaf4cb55 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -70,7 +70,7 @@ def __init__(self, corpus, id2word=None, num_topics=300): self.add_lifecycle_event("created", msg=f"created {self}") def __str__(self): - return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics) + return "%s<num_terms=%s, num_topics=%s>" % (self.__class__.__name__, self.num_terms, self.num_topics) def initialize(self, corpus): """Initialize the random projection matrix. diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 4152f3eb3d..cf2c3d3e1a 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -6,10 +6,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """This module implements functionality related to the `Term Frequency - Inverse Document Frequency -<https://en.wikipedia.org/wiki/Tf%E2%80%93idf>` vector space bag-of-words models.
- -For a more in-depth exposition of TF-IDF and its various SMART variants (normalization, weighting schemes), -see the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/ +<https://en.wikipedia.org/wiki/Tf%E2%80%93idf>`_ class of bag-of-words vector space models. """ @@ -347,11 +344,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/. - See Also - -------- - ~gensim.sklearn_api.tfidf.TfIdfTransformer : Class that also uses the SMART scheme. - resolve_weights : Function that also uses the SMART scheme. - References ---------- .. [1] Singhal, A., Buckley, C., & Mitra, M. (1996). `Pivoted Document Length @@ -435,7 +427,7 @@ def load(cls, *args, **kwargs): return model def __str__(self): - return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz) + return "%s<num_docs=%s, num_nnz=%s>" % (self.__class__.__name__, self.num_docs, self.num_nnz) def initialize(self, corpus): """Compute inverse document weights, which will be used to modify term frequencies for documents. diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 356f711408..061dcfc817 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -200,6 +200,7 @@ from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector from gensim import utils, matutils +from smart_open.compression import get_supported_extensions logger = logging.getLogger(__name__) @@ -445,7 +446,7 @@ def __init__( def build_vocab( self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs, - ): + ): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Parameters @@ -660,7 +661,7 @@ def prepare_vocab( "prepare_vocab", msg=( f"effective_min_count={self.effective_min_count} retains {len(retain_words)} unique " - f"words ({retain_unique_pct}%% of original {original_unique_total}, drops {drop_unique})" + f"words ({retain_unique_pct:.2f}% of original {original_unique_total}, drops {drop_unique})" ), ) @@ -670,7 +671,7 @@ def prepare_vocab( "prepare_vocab", msg=( f"effective_min_count={self.effective_min_count} leaves {retain_total} word corpus " - f"({retain_pct}%% of original {original_total}, drops {drop_total})" + f"({retain_pct:.2f}% of original {original_total}, drops {drop_total})" ), ) else: @@ -705,9 +706,9 @@ def prepare_vocab( self.add_lifecycle_event( "prepare_vocab", msg=( - f"added {len(new_words)} new unique words ({new_unique_pct}%% of original " + f"added {len(new_words)} new unique words ({new_unique_pct:.2f}% of original " f"{original_unique_total}) and increased the count of {len(pre_exist_words)} " - f"pre-existing words ({pre_exist_unique_pct}%% of original {original_unique_total})" + f"pre-existing words ({pre_exist_unique_pct:.2f}% of original {original_unique_total})" ), ) retain_words = new_words + pre_exist_words @@ -833,11 +834,11 @@ def make_cum_table(self, domain=2**31 - 1): train_words_pow = 0.0 for word_index in range(vocab_size): count = self.wv.get_vecattr(word_index, 'count') - train_words_pow += count**self.ns_exponent + train_words_pow += count**float(self.ns_exponent) cumulative = 0.0 for word_index in range(vocab_size): count = self.wv.get_vecattr(word_index, 'count') - cumulative += count**self.ns_exponent + cumulative += count**float(self.ns_exponent) self.cum_table[word_index] = round(cumulative / train_words_pow * domain) if len(self.cum_table) > 0: assert self.cum_table[-1]
== domain @@ -1285,7 +1286,7 @@ def _log_epoch_progress( report = progress_queue.get() # blocks if workers too slow if report is None: # a thread reporting that it finished unfinished_worker_count -= 1 - logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) + logger.debug("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) continue examples, trained_words, raw_words = report job_tally += 1 @@ -1502,6 +1503,14 @@ def _check_corpus_sanity(self, corpus_iterable=None, corpus_file=None, passes=1) raise TypeError( f"Using a generator as corpus_iterable can't support {passes} passes. Try a re-iterable sequence.") + if corpus_iterable is None: + _, corpus_ext = os.path.splitext(corpus_file) + if corpus_ext.lower() in get_supported_extensions(): + raise TypeError( + f"Training from compressed files is not supported with the `corpus_file` argument. " + f"Please decompress {corpus_file} or use `corpus_iterable` instead." + ) + def _check_training_sanity(self, epochs=0, total_examples=None, total_words=None, **kwargs): """Checks whether the training parameters make sense. @@ -1584,14 +1593,14 @@ def _log_progress( # examples-based progress % logger.info( "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", - cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed, + cur_epoch, 100.0 * example_count / total_examples, trained_word_count / elapsed, -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue) ) else: # words-based progress % logger.info( "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", - cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed, + cur_epoch, 100.0 * raw_word_count / total_words, trained_word_count / elapsed, -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue) ) @@ -1627,8 +1636,8 @@ def _log_epoch_end( """ logger.info( - "EPOCH - %i : training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed, + "EPOCH %i: training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", + cur_epoch, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed, ) # don't warn if training in file-based mode, because it's expected behavior @@ -1638,12 +1647,12 @@ def _log_epoch_end( # check that the input corpus hasn't changed during iteration if total_examples and total_examples != example_count: logger.warning( - "EPOCH - %i : supplied example count (%i) did not equal expected count (%i)", cur_epoch + 1, + "EPOCH %i: supplied example count (%i) did not equal expected count (%i)", cur_epoch, example_count, total_examples ) if total_words and total_words != raw_word_count: logger.warning( - "EPOCH - %i : supplied raw word count (%i) did not equal expected count (%i)", cur_epoch + 1, + "EPOCH %i: supplied raw word count (%i) did not equal expected count (%i)", cur_epoch, raw_word_count, total_words ) @@ -1882,7 +1891,7 @@ def __str__(self): and learning rate.
""" - return "%s(vocab=%s, vector_size=%s, alpha=%s)" % ( + return "%s" % ( self.__class__.__name__, len(self.wv.index_to_key), self.wv.vector_size, self.alpha, ) diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index db66db67e0..cdb966547d 100644 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -147,7 +147,7 @@ def __getstate__(self): return result def __str__(self): - return "%s Shard(%i documents in %s)" % (self.cls.__name__, len(self), self.fullname()) + return "%s<%i documents in %s>" % (self.cls.__name__, len(self), self.fullname()) def get_index(self): """Load & get index. @@ -359,8 +359,8 @@ def __len__(self): return len(self.fresh_docs) + sum(len(shard) for shard in self.shards) def __str__(self): - return "Similarity index with %i documents in %i shards (stored under %s)" % ( - len(self), len(self.shards), self.output_prefix + return "%s<%i documents in %i shards stored under %s>" % ( + self.__class__.__name__, len(self), len(self.shards), self.output_prefix ) def add_documents(self, corpus): @@ -1015,7 +1015,7 @@ class WmdSimilarity(interfaces.SimilarityABC): >>> >>> model = Word2Vec(common_texts, vector_size=20, min_count=1) # train word-vectors >>> - >>> index = WmdSimilarity(common_texts, model) + >>> index = WmdSimilarity(common_texts, model.wv) >>> # Make query. >>> query = ['trees'] >>> sims = index[query] @@ -1096,7 +1096,7 @@ def get_similarities(self, query): return result def __str__(self): - return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.w2v_model.wv.syn0.shape[1]) + return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.wv.vectors.shape[1]) class SparseMatrixSimilarity(interfaces.SimilarityABC): diff --git a/gensim/similarities/fastss.pyx b/gensim/similarities/fastss.pyx index a4e8cba54b..e47a5442b2 100644 --- a/gensim/similarities/fastss.pyx +++ b/gensim/similarities/fastss.pyx @@ -137,6 +137,15 @@ def bytes2set(b): class FastSS: + """ + Fast implementation of FastSS (Fast Similarity Search): https://fastss.csg.uzh.ch/ + + FastSS enables fuzzy search of a dynamic query (a word, string) against a static + dictionary (a set of words, strings). The "fuziness" is configurable by means + of a maximum edit distance (Levenshtein) between the query string and any of the + dictionary words. + + """ def __init__(self, words=None, max_dist=2): """ diff --git a/gensim/similarities/levenshtein.py b/gensim/similarities/levenshtein.py index 51da72c065..768429a62a 100644 --- a/gensim/similarities/levenshtein.py +++ b/gensim/similarities/levenshtein.py @@ -29,7 +29,7 @@ class LevenshteinSimilarityIndex(TermSimilarityIndex): "Levenshtein similarity" is a modification of the Levenshtein (edit) distance, defined in [charletetal17]_. - This implementation uses the FastSS neighbourhood algorithm + This implementation uses the :class:`~gensim.similarities.fastss.FastSS` algorithm for fast kNN nearest-neighbor retrieval. 
Parameters diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py index d2a3f6728f..f97801ca66 100644 --- a/gensim/similarities/termsim.py +++ b/gensim/similarities/termsim.py @@ -61,7 +61,7 @@ def most_similar(self, term, topn=10): def __str__(self): members = ', '.join('%s=%s' % pair for pair in vars(self).items()) - return '%s(%s)' % (self.__class__.__name__, members) + return '%s<%s>' % (self.__class__.__name__, members) class UniformTermSimilarityIndex(TermSimilarityIndex): diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 9396fe5ac0..2b111f7306 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -50,6 +50,11 @@ def setUp(self): ['not a token', 'not an id', 'tests using', "this list"], ['should raise', 'an error', 'to pass', 'correctly'] ] + # list of topics with unseen words in the dictionary + self.topics5 = [ + ['aaaaa', 'bbbbb', 'ccccc', 'eeeee'], + ['ddddd', 'fffff', 'ggggh', 'hhhhh'] + ] self.topicIds1 = [] for topic in self.topics1: self.topicIds1.append([self.dictionary.token2id[token] for token in topic]) @@ -70,8 +75,14 @@ def check_coherence_measure(self, coherence): cm2 = CoherenceModel(topics=self.topics2, **kwargs) cm3 = CoherenceModel(topics=self.topics3, **kwargs) cm4 = CoherenceModel(topics=self.topicIds1, **kwargs) + + # check if the same topic always returns the same coherence value + cm5 = CoherenceModel(topics=[self.topics1[0]], **kwargs) + self.assertRaises(ValueError, lambda: CoherenceModel(topics=self.topics4, **kwargs)) + self.assertRaises(ValueError, lambda: CoherenceModel(topics=self.topics5, **kwargs)) self.assertEqual(cm1.get_coherence(), cm4.get_coherence()) + self.assertEqual(cm1.get_coherence_per_topic()[0], cm5.get_coherence()) self.assertIsInstance(cm3.get_coherence(), np.double) self.assertGreater(cm1.get_coherence(), cm2.get_coherence()) diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py index 6c09ea2d1f..431b07c0ce 100644 --- a/gensim/test/test_corpora.py +++ b/gensim/test/test_corpora.py @@ -27,7 +27,7 @@ from gensim.test.utils import datapath, get_tmpfile, common_corpus -AZURE = bool(os.environ.get('PIPELINE_WORKSPACE')) +GITHUB_ACTIONS_WINDOWS = os.environ.get('RUNNER_OS') == 'Windows' class DummyTransformer: @@ -62,7 +62,7 @@ def tearDown(self): except OSError: pass - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_load(self): fname = datapath('testcorpus.' + self.file_extension.lstrip('.')) corpus = self.corpus_class(fname) @@ -71,7 +71,7 @@ def test_load(self): # the deerwester corpus always has nine documents self.assertEqual(len(docs), 9) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_len(self): fname = datapath('testcorpus.' 
+ self.file_extension.lstrip('.')) corpus = self.corpus_class(fname) @@ -87,7 +87,7 @@ def test_len(self): self.assertEqual(len(corpus), 9) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_empty_input(self): tmpf = get_tmpfile('gensim_corpus.tst') with open(tmpf, 'w') as f: @@ -102,7 +102,7 @@ def test_empty_input(self): docs = list(corpus) self.assertEqual(len(docs), 0) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_save(self): corpus = self.TEST_CORPUS tmpf = get_tmpfile('gensim_corpus.tst') @@ -114,7 +114,7 @@ def test_save(self): corpus2 = list(self.corpus_class(tmpf)) self.assertEqual(corpus, corpus2) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_serialize(self): corpus = self.TEST_CORPUS tmpf = get_tmpfile('gensim_corpus.tst') @@ -136,7 +136,7 @@ def test_serialize(self): idx = [1, 3, 5, 7] self.assertEqual(corpus[idx], corpus2[idx]) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_serialize_compressed(self): corpus = self.TEST_CORPUS tmpf = get_tmpfile('gensim_corpus.tst') @@ -154,7 +154,7 @@ def test_serialize_compressed(self): for i in range(len(corpus)): self.assertEqual(corpus[i], corpus2[i]) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_switch_id2word(self): fname = datapath('testcorpus.' + self.file_extension.lstrip('.')) corpus = self.corpus_class(fname) @@ -172,7 +172,7 @@ def test_switch_id2word(self): testdoc2 = set((to_unicode(corpus.id2word[x]), y) for x, y in firstdoc2) self.assertEqual(testdoc2, {('computer', 1), ('human', 1), ('interface', 1)}) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_indexing(self): fname = datapath('testcorpus.' 
+ self.file_extension.lstrip('.')) corpus = self.corpus_class(fname) @@ -245,7 +245,7 @@ def test_closed_file_object(self): self.assertEqual(f, 0) self.assertEqual(s, 0) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_load(self): self.assertEqual(self.corpus.num_docs, 9) self.assertEqual(self.corpus.num_terms, 12) diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index c8b7516c99..a7e1fa58df 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -720,6 +720,15 @@ def test_train_warning(self, loglines): def test_load_on_class_error(self): """Test if exception is raised when loading doc2vec model on instance""" self.assertRaises(AttributeError, load_on_instance) + + def test_negative_ns_exp(self): + """The model should accept a negative ns_exponent as a valid value.""" + model = doc2vec.Doc2Vec(sentences, ns_exponent=-1, min_count=1, workers=1) + tmpf = get_tmpfile('d2v_negative_exp.tst') + model.save(tmpf) + loaded_model = doc2vec.Doc2Vec.load(tmpf) + loaded_model.train(sentences, total_examples=model.corpus_count, epochs=1) + assert loaded_model.ns_exponent == -1, loaded_model.ns_exponent # endclass TestDoc2VecModel diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index efc6a3ca8e..ecc44a30e4 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -8,6 +8,7 @@ import logging import unittest import os +import shutil import subprocess import struct import sys @@ -44,7 +45,7 @@ BUCKET = 10000 FT_HOME = os.environ.get("FT_HOME") -FT_CMD = os.path.join(FT_HOME, "fasttext") if FT_HOME else None +FT_CMD = shutil.which("fasttext", path=FT_HOME) or shutil.which("fasttext") new_sentences = [ @@ -372,6 +373,9 @@ def test_most_similar_cosmul(self): self.assertEqual( self.test_model.wv.most_similar_cosmul('nights'), self.test_model.wv.most_similar_cosmul(positive=['nights'])) + self.assertEqual( + self.test_model.wv.most_similar_cosmul('the', 'and'), + self.test_model.wv.most_similar_cosmul(positive=['the'], negative=['and'])) def test_lookup(self): # In vocab, sanity check @@ -762,6 +766,15 @@ def test_vectors_for_all_without_inference(self): predicted = vectors_for_all['responding'] assert np.allclose(expected, predicted) + def test_negative_ns_exp(self): + """The model should accept a negative ns_exponent as a valid value.""" + model = FT_gensim(sentences, ns_exponent=-1, min_count=1, workers=1) + tmpf = get_tmpfile('fasttext_negative_exp.tst') + model.save(tmpf) + loaded_model = FT_gensim.load(tmpf) + loaded_model.train(sentences, total_examples=model.corpus_count, epochs=1) + assert loaded_model.ns_exponent == -1, loaded_model.ns_exponent + @pytest.mark.parametrize('shrink_windows', [True, False]) def test_cbow_hs_training(shrink_windows): @@ -1652,7 +1665,7 @@ def _save_test_model(out_base_fname, model_params): subprocess.check_call(cmd) -@unittest.skipIf(not FT_HOME, "FT_HOME env variable not set, skipping test") +@unittest.skipIf(not FT_CMD, "fasttext not in FT_HOME or PATH, skipping test") class SaveFacebookByteIdentityTest(unittest.TestCase): """ This class contains tests that check the following scenario: @@ -1699,7 +1712,7 @@ def line_to_array(line): return np.array([line_to_array(line) for line in out.splitlines()], dtype=np.float32) -@unittest.skipIf(not os.environ.get("FT_HOME", None), "FT_HOME env variable not set, skipping test") +@unittest.skipIf(not FT_CMD, "fasttext not in FT_HOME or PATH, skipping test") class
SaveFacebookFormatReadingTest(unittest.TestCase): """ This class contains tests that check the following scenario: diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index d5eda547ea..cc70577842 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -366,6 +366,35 @@ def test_no_header(self): self.assertEqual(randkv.index_to_key, reloadtxtkv.index_to_key) self.assertTrue((randkv.vectors == reloadtxtkv.vectors).all()) + def test_get_mean_vector(self): + """Test get_mean_vector returns expected results.""" + keys = [ + 'conflict', + 'administration', + 'terrorism', + 'call', + 'an out-of-vocabulary word', + ] + weights = [1, 2, 3, 1, 2] + expected_result_1 = np.array([ + 0.02000151, -0.12685453, 0.09196121, 0.25514853, 0.25740655, + -0.11134843, -0.0502661, -0.19278568, -0.83346179, -0.12068878, + ], dtype=np.float32) + expected_result_2 = np.array([ + -0.0145228, -0.11530358, 0.1169825, 0.22537769, 0.29353586, + -0.10458107, -0.05272481, -0.17547795, -0.84245106, -0.10356515, + ], dtype=np.float32) + expected_result_3 = np.array([ + 0.01343237, -0.47651053, 0.45645328, 0.98304356, 1.1840123, + -0.51647933, -0.25308795, -0.77931081, -3.55954733, -0.55429711, + ], dtype=np.float32) + + self.assertTrue(np.allclose(self.vectors.get_mean_vector(keys), expected_result_1)) + self.assertTrue(np.allclose(self.vectors.get_mean_vector(keys, weights), expected_result_2)) + self.assertTrue(np.allclose( + self.vectors.get_mean_vector(keys, pre_normalize=False), expected_result_3) + ) + class Gensim320Test(unittest.TestCase): def test(self): diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index b809b39754..297006b75f 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -24,7 +24,7 @@ from gensim.test import basetmtests from gensim.test.utils import datapath, get_tmpfile, common_texts -AZURE = bool(os.environ.get('PIPELINE_WORKSPACE')) +GITHUB_ACTIONS_WINDOWS = os.environ.get('RUNNER_OS') == 'Windows' dictionary = Dictionary(common_texts) corpus = [dictionary.doc2bow(text) for text in common_texts] @@ -232,7 +232,7 @@ def test_get_topic_terms(self): self.assertTrue(isinstance(k, numbers.Integral)) self.assertTrue(np.issubdtype(v, np.floating)) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_get_document_topics(self): model = self.class_( diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py index 0c18b2cf8c..90f6977410 100644 --- a/gensim/test/test_ldaseqmodel.py +++ b/gensim/test/test_ldaseqmodel.py @@ -214,7 +214,7 @@ def test_topic_word(self): topics = self.ldaseq.print_topics(0) expected_topic_word = [('skills', 0.035999999999999997)] self.assertEqual(topics[0][0][0], expected_topic_word[0][0]) - self.assertAlmostEqual(topics[0][0][1], expected_topic_word[0][1], places=2) + self.assertAlmostEqual(topics[0][0][1], expected_topic_word[0][1], delta=0.0012) # testing document-topic proportions def test_doc_topic(self): diff --git a/gensim/test/test_lee.py b/gensim/test/test_lee.py index 1eadd398a9..c8a592d539 100644 --- a/gensim/test/test_lee.py +++ b/gensim/test/test_lee.py @@ -25,7 +25,6 @@ from __future__ import with_statement import logging -import os.path import unittest from functools import partial @@ -34,6 +33,7 @@ from gensim import corpora, models, utils, matutils from gensim.parsing.preprocessing import preprocess_documents, preprocess_string, DEFAULT_FILTERS +from gensim.test.utils import datapath bg_corpus =
None corpus = None @@ -45,24 +45,23 @@ def setUp(self): """setup lee test corpora""" global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2 - pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - bg_corpus_file = 'lee_background.cor' - corpus_file = 'lee.cor' - sim_file = 'similarities0-1.txt' + bg_corpus_file = datapath('lee_background.cor') + corpus_file = datapath('lee.cor') + sim_file = datapath('similarities0-1.txt') # read in the corpora latin1 = partial(utils.to_unicode, encoding='latin1') - with utils.open(os.path.join(pre_path, bg_corpus_file), 'rb') as f: + with utils.open(bg_corpus_file, 'rb') as f: bg_corpus = preprocess_documents(latin1(line) for line in f) - with utils.open(os.path.join(pre_path, corpus_file), 'rb') as f: + with utils.open(corpus_file, 'rb') as f: corpus = preprocess_documents(latin1(line) for line in f) - with utils.open(os.path.join(pre_path, bg_corpus_file), 'rb') as f: + with utils.open(bg_corpus_file, 'rb') as f: bg_corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f] - with utils.open(os.path.join(pre_path, corpus_file), 'rb') as f: + with utils.open(corpus_file, 'rb') as f: corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f] # read the human similarity data - sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file)) + sim_matrix = np.loadtxt(sim_file) sim_m_size = np.shape(sim_matrix)[0] human_sim_vector = sim_matrix[np.triu_indices(sim_m_size, 1)] diff --git a/gensim/test/test_matutils.py b/gensim/test/test_matutils.py index 97e4189d89..5c5f14398e 100644 --- a/gensim/test/test_matutils.py +++ b/gensim/test/test_matutils.py @@ -7,7 +7,9 @@ import logging import unittest import numpy as np +from numpy.testing import assert_array_equal from scipy import sparse +from scipy.sparse import csc_matrix from scipy.special import psi # gamma function utils import gensim.matutils as matutils @@ -266,6 +268,44 @@ def test_return_norm_zero_vector_gensim_sparse(self): self.assertEqual(norm, 1.0) +class TestSparse2Corpus(unittest.TestCase): + def setUp(self): + self.orig_array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + self.s2c = matutils.Sparse2Corpus(csc_matrix(self.orig_array)) + + def test_getitem_slice(self): + assert_array_equal(self.s2c[:2].sparse.toarray(), self.orig_array[:, :2]) + assert_array_equal(self.s2c[1:3].sparse.toarray(), self.orig_array[:, 1:3]) + + def test_getitem_index(self): + self.assertListEqual(self.s2c[1], [(0, 2), (1, 5), (2, 8)]) + + def test_getitem_list_of_indices(self): + assert_array_equal( + self.s2c[[1, 2]].sparse.toarray(), self.orig_array[:, [1, 2]] + ) + assert_array_equal(self.s2c[[1]].sparse.toarray(), self.orig_array[:, [1]]) + + def test_getitem_ndarray(self): + assert_array_equal( + self.s2c[np.array([1, 2])].sparse.toarray(), self.orig_array[:, [1, 2]] + ) + assert_array_equal( + self.s2c[np.array([1])].sparse.toarray(), self.orig_array[:, [1]] + ) + + def test_getitem_range(self): + assert_array_equal( + self.s2c[range(1, 3)].sparse.toarray(), self.orig_array[:, [1, 2]] + ) + assert_array_equal( + self.s2c[range(1, 2)].sparse.toarray(), self.orig_array[:, [1]] + ) + + def test_getitem_ellipsis(self): + assert_array_equal(self.s2c[...].sparse.toarray(), self.orig_array) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 35ddd03397..0b917980d2 100644 --- 
a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -373,6 +373,11 @@ def test_iter(self): self.assertTrue(numpy.alltrue(sims >= 0.0)) self.assertTrue(numpy.alltrue(sims <= 1.0)) + @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed") + def test_str(self): + index = self.cls(TEXTS, self.w2v_model) + self.assertTrue(str(index)) + class TestSoftCosineSimilarity(_TestSimilarityABC): def setUp(self): diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py index b21fdc6063..2ccd61f597 100644 --- a/gensim/test/test_translation_matrix.py +++ b/gensim/test/test_translation_matrix.py @@ -1,10 +1,12 @@ #!/usr/bin/env python # encoding: utf-8 + from collections import namedtuple import unittest import logging import numpy as np +import pytest from scipy.spatial.distance import cosine from gensim.models.doc2vec import Doc2Vec @@ -60,6 +62,10 @@ def test_translate_nn(self): for idx, item in enumerate(self.test_word_pairs): self.assertTrue(item[1] in translated_words[item[0]]) + @pytest.mark.xfail( + True, + reason='blinking test, can be related to ' + ) def test_translate_gc(self): # Test globally corrected neighbour retrieval method model = translation_matrix.TranslationMatrix(self.source_word_vec, self.target_word_vec, self.word_pairs) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 43505b0be2..7e58275208 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -13,6 +13,8 @@ import os import bz2 import sys +import tempfile +import subprocess import numpy as np @@ -26,6 +28,7 @@ from gensim import utils from gensim.models import word2vec, keyedvectors +from gensim.utils import check_output from gensim.test.utils import ( datapath, get_tmpfile, temporary_file, common_texts as sentences, LeeCorpus, lee_corpus_list, @@ -554,6 +557,12 @@ def test_evaluate_word_analogies(self): """Test that evaluating analogies on KeyedVectors give sane results""" model = word2vec.Word2Vec(LeeCorpus()) score, sections = model.wv.evaluate_word_analogies(datapath('questions-words.txt')) + score_cosmul, sections_cosmul = model.wv.evaluate_word_analogies( + datapath('questions-words.txt'), + similarity_function='most_similar_cosmul' + ) + self.assertEqual(score, score_cosmul) + self.assertEqual(sections, sections_cosmul) self.assertGreaterEqual(score, 0.0) self.assertLessEqual(score, 1.0) self.assertGreater(len(sections), 0) @@ -571,9 +580,9 @@ def test_evaluate_word_pairs(self): pearson = correlation[0][0] spearman = correlation[1][0] oov = correlation[2] - self.assertTrue(0.1 < pearson < 1.0, "pearson {pearson} not between 0.1 & 1.0") - self.assertTrue(0.1 < spearman < 1.0, "spearman {spearman} not between 0.1 and 1.0") - self.assertTrue(0.0 <= oov < 90.0, "OOV {oov} not between 0.0 and 90.0") + self.assertTrue(0.1 < pearson < 1.0, f"pearson {pearson} not between 0.1 & 1.0") + self.assertTrue(0.1 < spearman < 1.0, f"spearman {spearman} not between 0.1 and 1.0") + self.assertTrue(0.0 <= oov < 90.0, f"OOV {oov} not between 0.0 and 90.0") def test_evaluate_word_pairs_from_file(self): """Test Spearman and Pearson correlation coefficients give sane results on similarity datasets""" @@ -833,7 +842,7 @@ def test_parallel(self): # the exact vectors and therefore similarities may differ, due to different thread collisions/randomization # so let's test only for top10 neighbor_rank = [word for word, sim in sims].index(expected_neighbor) - self.assertLess(neighbor_rank, 20) + self.assertLess(neighbor_rank, 5) def 
test_r_n_g(self): """Test word2vec results identical with identical RNG seed.""" @@ -1040,6 +1049,13 @@ def test_load_on_class_error(self): """Test if exception is raised when loading word2vec model on instance""" self.assertRaises(AttributeError, load_on_instance) + def test_file_should_not_be_compressed(self): + """ + Is corpus_file a compressed file? + """ + with tempfile.NamedTemporaryFile(suffix=".bz2") as fp: + self.assertRaises(TypeError, word2vec.Word2Vec, None, fp.name) + def test_reset_from(self): """Test if reset_from() uses pre-built structures from other model""" model = word2vec.Word2Vec(sentences, min_count=1) @@ -1054,6 +1070,15 @@ def test_compute_training_loss(self): training_loss_val = model.get_latest_training_loss() self.assertTrue(training_loss_val > 0.0) + def test_negative_ns_exp(self): + """The model should accept a negative ns_exponent as a valid value.""" + model = word2vec.Word2Vec(sentences, ns_exponent=-1, min_count=1, workers=1) + tmpf = get_tmpfile('w2v_negative_exp.tst') + model.save(tmpf) + loaded_model = word2vec.Word2Vec.load(tmpf) + loaded_model.train(sentences, total_examples=model.corpus_count, epochs=1) + assert loaded_model.ns_exponent == -1, loaded_model.ns_exponent + # endclass TestWord2VecModel @@ -1145,15 +1170,18 @@ def test_path_line_sentences_one_file(self): # endclass TestWord2VecSentenceIterators -# TODO: get correct path to Python binary -# class TestWord2VecScripts(unittest.TestCase): -# def test_word2vec_stand_alone_script(self): -# """Does Word2Vec script launch standalone?""" -# cmd = 'python -m gensim.scripts.word2vec_standalone -train ' + datapath('testcorpus.txt') + \ -# ' -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3 -min_count 1' -# output = check_output(cmd, stderr=PIPE) -# self.assertEqual(output, '0') -# #endclass TestWord2VecScripts + +class TestWord2VecScripts(unittest.TestCase): + def test_word2vec_stand_alone_script(self): + """Does Word2Vec script launch standalone?""" + cmd = [ + sys.executable, '-m', 'gensim.scripts.word2vec_standalone', + '-train', datapath('testcorpus.txt'), + '-output', 'vec.txt', '-size', '200', '-sample', '1e-4', + '-binary', '0', '-iter', '3', '-min_count', '1', + ] + output = check_output(args=cmd, stderr=subprocess.PIPE) + self.assertEqual(output, b'') if not hasattr(TestWord2VecModel, 'assertLess'): diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 83cbdc6471..2c06185a0b 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -300,18 +300,11 @@ def accumulate(self, texts, window_size): def _iter_texts(self, texts): dtype = np.uint16 if np.iinfo(np.uint16).max >= self._vocab_size else np.uint32 for text in texts: - if self.text_is_relevant(text): - yield np.fromiter(( - self.id2contiguous[self.token2id[w]] if w in self.relevant_words - else self._none_token - for w in text), dtype=dtype, count=len(text)) - - def text_is_relevant(self, text): - """Check if the text has any relevant words.""" - for word in text: - if word in self.relevant_words: - return True - return False + ids = ( + self.id2contiguous[self.token2id[w]] if w in self.relevant_words else self._none_token + for w in text + ) + yield np.fromiter(ids, dtype=dtype, count=len(text)) class InvertedIndexAccumulator(WindowedTextsAnalyzer, InvertedIndexBased): @@ -436,7 +429,7 @@ def __init__(self, processes, *args, **kwargs): self.batch_size = kwargs.get('batch_size', 64) def __str__(self): - return "%s(processes=%s,
batch_size=%s)" % ( + return "%s" % ( self.__class__.__name__, self.processes, self.batch_size) def accumulate(self, texts, window_size): diff --git a/gensim/utils.py b/gensim/utils.py index 30b6d85f58..78d64b88e6 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -30,6 +30,7 @@ from copy import deepcopy from datetime import datetime import platform +import types import numpy as np import scipy.sparse @@ -827,7 +828,7 @@ def __init__(self, num_terms): self.num_terms = num_terms def __str__(self): - return "FakeDict(num_terms=%s)" % self.num_terms + return "%s" % (self.__class__.__name__, self.num_terms) def __getitem__(self, val): if 0 <= val < self.num_terms: @@ -2084,3 +2085,19 @@ def effective_n_jobs(n_jobs): elif n_jobs < 0: n_jobs = max(multiprocessing.cpu_count() + 1 + n_jobs, 1) return n_jobs + + +def is_empty(corpus): + """Is the corpus (an iterable or a scipy.sparse array) empty?""" + if scipy.sparse.issparse(corpus): + return corpus.shape[1] == 0 # by convention, scipy.sparse documents are columns + if isinstance(corpus, types.GeneratorType): + return False # don't try to guess emptiness of generators, may lose elements irretrievably + try: + # list, numpy array etc + first_doc = next(iter(corpus)) # noqa: F841 (ignore unused variable) + return False # first document exists => not empty + except StopIteration: + return True + except Exception: + return False diff --git a/jupyter_execute_cell.png b/jupyter_execute_cell.png deleted file mode 100644 index 3005d277b3..0000000000 Binary files a/jupyter_execute_cell.png and /dev/null differ diff --git a/jupyter_home.png b/jupyter_home.png deleted file mode 100644 index 770b1aaee3..0000000000 Binary files a/jupyter_home.png and /dev/null differ diff --git a/release/README.md b/release/README.md index 1ec47db5dc..2614252723 100644 --- a/release/README.md +++ b/release/README.md @@ -1,3 +1,3 @@ Scripts to help when making new releases. -For more info, see [our Wiki page](https://github.com/RaRe-Technologies/gensim/wiki/Developer-page#make-a-new-release-for-maintainers). +For more info, see [our Wiki page](https://github.com/RaRe-Technologies/gensim/wiki/Maintainer-page). diff --git a/release/generate_changelog.py b/release/generate_changelog.py index 97cc306f62..62ca7b329b 100644 --- a/release/generate_changelog.py +++ b/release/generate_changelog.py @@ -10,12 +10,22 @@ import requests import time + +def throttle_get(*args, seconds=10, **kwargs): + result = requests.get(*args, **kwargs) + result.raise_for_status() + + # Avoid Github API throttling; see https://github.com/RaRe-Technologies/gensim/pull/3203#issuecomment-887453109 + time.sleep(seconds) + + return result + + # # The releases get sorted in reverse chronological order, so the first release # in the list is the most recent. 
# -get = requests.get('https://api.github.com/repos/RaRe-Technologies/gensim/releases') -get.raise_for_status() +get = throttle_get('https://api.github.com/repos/RaRe-Technologies/gensim/releases') most_recent_release = get.json()[0] release_timestamp = most_recent_release['published_at'] @@ -23,11 +33,11 @@ def iter_merged_prs(since=release_timestamp): page = 1 while True: - get = requests.get( + get = throttle_get( 'https://api.github.com/repos/RaRe-Technologies/gensim/pulls', params={'state': 'closed', 'page': page}, ) - get.raise_for_status() + pulls = get.json() if not pulls: break @@ -37,18 +47,15 @@ def iter_merged_prs(since=release_timestamp): yield pr page += 1 - # Avoid Github API throttling; see https://github.com/RaRe-Technologies/gensim/pull/3203#issuecomment-887453109 - time.sleep(1) def iter_closed_issues(since=release_timestamp): page = 1 while True: - get = requests.get( + get = throttle_get( 'https://api.github.com/repos/RaRe-Technologies/gensim/issues', params={'state': 'closed', 'page': page, 'since': since}, ) - get.raise_for_status() issues = get.json() if not issues: break @@ -60,8 +67,6 @@ def iter_closed_issues(since=release_timestamp): if 'pull_request' not in issue and issue['closed_at'] > since: yield issue page += 1 - # Avoid Github API throttling; see https://github.com/RaRe-Technologies/gensim/pull/3203#issuecomment-887453109 - time.sleep(1) fixed_issue_numbers = set() @@ -74,6 +79,12 @@ def iter_closed_issues(since=release_timestamp): # Unfortunately, the GitHub API doesn't link PRs to issues that they fix, # so we have to do it ourselves. # + if pr['body'] is None: + # + # Weird edge case, PR with no body + # + continue + for match in re.finditer(r'fix(es)? #(?P<number>\d+)\b', pr['body'], flags=re.IGNORECASE): fixed_issue_numbers.add(int(match.group('number'))) diff --git a/release/hijack_pr.py b/release/hijack_pr.py old mode 100644 new mode 100755 index f885579985..d109836d49 --- a/release/hijack_pr.py +++ b/release/hijack_pr.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """Hijack a PR to add commits as a maintainer. This is a two-step process: @@ -7,6 +8,16 @@ As a maintainer, you can add changes by making new commits and pushing them back to the remote. + +An example session: + + $ release/hijack_pr.py 1234 + $ git merge upstream/develop # or any other changes you want to make + $ release/hijack_pr.py push + +The above commands would check out the code for the PR, make changes to it, and push them back. +Obviously, this requires the PR to be writable, but most gensim PRs are. +If they aren't, then leave it up to the PR author to make the required changes.
""" import json import subprocess @@ -14,6 +25,23 @@ import smart_open +def check_output(command): + return subprocess.check_output(command).strip().decode('utf-8') + + +if sys.argv[1] == "push": + command = "git rev-parse --abbrev-ref HEAD@{upstream}".split() + remote, remote_branch = check_output(command).split('/') + current_branch = check_output(['git', 'branch', '--show-current']) + check_output(['git', 'push', remote, f'{current_branch}:{remote_branch}']) + + # + # Cleanup to prevent remotes and branches from piling up + # + check_output(['git', 'branch', '--delete', current_branch]) + check_output(['git', 'remote', 'remove', remote]) + sys.exit(0) + prid = int(sys.argv[1]) url = f"https://api.github.com/repos/RaRe-Technologies/gensim/pulls/{prid}" with smart_open.open(url) as fin: @@ -22,7 +50,7 @@ user = prinfo['head']['user']['login'] ssh_url = prinfo['head']['repo']['ssh_url'] -remotes = subprocess.check_output(['git', 'remote']).strip().decode('utf-8').split('\n') +remotes = check_output(['git', 'remote']).split('\n') if user not in remotes: subprocess.check_call(['git', 'remote', 'add', user, ssh_url]) @@ -30,4 +58,14 @@ ref = prinfo['head']['ref'] subprocess.check_call(['git', 'checkout', f'{user}/{ref}']) -subprocess.check_call(['git', 'switch', '-c', f'{ref}']) + +# +# Prefix the local branch name with the user to avoid naming clashes with +# existing branches, e.g. develop +# +subprocess.check_call(['git', 'switch', '-c', f'{user}_{ref}']) + +# +# Set the upstream so we can push back to it more easily +# +subprocess.check_call(['git', 'branch', '--set-upstream-to', f'{user}/{ref}']) diff --git a/setup.py b/setup.py index da9c8c23ad..e3ee0c3bdb 100644 --- a/setup.py +++ b/setup.py @@ -10,15 +10,13 @@ python ./setup.py install """ -import distutils.cmd -import distutils.log import itertools import os import platform import shutil import sys -from setuptools import Extension, find_packages, setup +from setuptools import Extension, find_packages, setup, distutils from setuptools.command.build_ext import build_ext c_extensions = { @@ -268,11 +266,10 @@ def run(self): # packages included for build-testing everywhere core_testenv = [ 'pytest', -# 'pytest-rerunfailures', # disabled 2020-08-28 for + 'pytest-cov', 'mock', 'cython', 'testfixtures', - 'Morfessor>=2.0.2a4', ] if not (sys.platform.lower().startswith("win") and sys.version_info[:2] >= (3, 9)): @@ -321,13 +318,12 @@ def run(self): # to build with any sane version of Cython, so we should update this pin # periodically. 
# -CYTHON_STR = 'Cython==0.29.23' +CYTHON_STR = 'Cython==0.29.28' install_requires = [ NUMPY_STR, 'scipy >= 0.18.1', 'smart_open >= 1.8.1', - "dataclasses; python_version < '3.7'", # pre-py3.7 needs `dataclasses` backport for use of `dataclass` in doc2vec.py ] setup_requires = [NUMPY_STR] @@ -338,7 +334,7 @@ def run(self): setup( name='gensim', - version='4.1.2', + version='4.2.0', description='Python framework for fast Vector Space Modelling', long_description=LONG_DESCRIPTION, @@ -350,6 +346,9 @@ def run(self): author_email='me@radimrehurek.com', url='http://radimrehurek.com/gensim', + project_urls={ + 'Source': 'https://github.com/RaRe-Technologies/gensim', + }, download_url='http://pypi.python.org/pypi/gensim', license='LGPL-2.1-only', diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 12811b8ba5..0000000000 --- a/tox.ini +++ /dev/null @@ -1,133 +0,0 @@ -[tox] -minversion = 2.0 -envlist = {py36,py37,py38, py39}-{win,linux}, flake8, docs, docs-upload, download-wheels, upload-wheels, test-pypi -skipsdist = True -platform = linux: linux - win: win64 - - -[flake8] -ignore = E12, W503 -max-line-length = 120 -show-source = True - - -[flake8-rst] -filename = *.rst *.py -max-line-length = 120 -ignore = E203, # space before : - E402, # module level import not at top of file - # Classes / functions in a docstring block generate those errors - E302, # expected 2 blank lines, found 0 - E305, # expected 2 blank lines after class or function definition, found 0 - F821, # undefined name; remove once all docstrings are fully executable -exclude = .venv, .git, .tox, dist, doc, build, gensim/models/deprecated - - -[pytest] -addopts = -rfxEXs --durations=20 --showlocals - - -[testenv] -recreate = True - -install_command = python -m pip install --timeout=60 {env:TOX_PIP_OPTS:} {opts} {packages} - -deps = - pip>=19.1.1 - linux: .[test] - win: .[test-win] - -setenv = - FT_HOME={env:FT_HOME:} - WR_HOME={env:WR_HOME:} - VOWPAL_WABBIT_PATH={env:VOWPAL_WABBIT_PATH:} - DTM_PATH={env:DTM_PATH:} - MALLET_HOME={env:MALLET_HOME:} - SKIP_NETWORK_TESTS={env:SKIP_NETWORK_TESTS:} - BOTO_CONFIG={env:BOTO_CONFIG:} - PIPELINE_WORKSPACE={env:PIPELINE_WORKSPACE:} - PYTHONHASHSEED=1 - TOX_PARALLEL_NO_SPINNER=1 - -commands = - python --version - pip --version - python setup.py build_ext --inplace - pytest {posargs:gensim/test} - - -[testenv:flake8] -recreate = True -deps = - # Pinned to 3.7.9 because >3.8.0 triggers "AttributeError: 'Namespace' object has no attribute 'output_file'" - # in flake8-rst. 
Apparently some bug in flake8-rst: - # https://gitlab.com/pycqa/flake8/-/issues/641 - # https://github.com/kataev/flake8-rst/pull/23/files - flake8==3.7.9 - -commands = flake8 gensim/ {posargs} - - -[testenv:flake8-docs] -recreate = True -deps = - flake8-rst==0.7.2 - flake8==3.7.9 - -commands = flake8-rst gensim/ docs/ {posargs} - - -[testenv:compile] -basepython = python3 -recreate = True - -deps = numpy -commands = python setup.py build_ext --inplace - - -[testenv:docs] -basepython = python3 -recreate = True -whitelist_externals = make -deps = .[docs] - -commands = - python setup.py build_ext --inplace - make -C docs/src clean html - - -[testenv:docs-upload] -recreate = True -whitelist_externals = make -deps = .[docs] -changedir = docs/src - -commands = make clean html upload - - -[testenv:download-wheels] -deps = wheelhouse_uploader -whitelist_externals = rm -recreate = True - -commands = - rm -rf dist/ - python setup.py sdist fetch_artifacts - - -[testenv:upload-wheels] -deps = twine - -commands = twine upload dist/* - - -[testenv:test-pypi] -deps = twine -whitelist_externals = rm - -commands = - rm -rf dist/ - python setup.py sdist - twine upload --repository-url https://test.pypi.org/legacy/ dist/* - ; Go to https://testpypi.python.org/pypi?name=gensim&:action=display and check result
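For illustration (not part of the diff): the Sparse2Corpus indexing behaviour pinned down by the new TestSparse2Corpus cases above. Slices, index lists, ranges and Ellipsis return a new Sparse2Corpus over the selected document columns, while a single integer returns that one document as a list of (id, value) tuples:

    import numpy as np
    from scipy.sparse import csc_matrix
    from gensim.matutils import Sparse2Corpus

    corpus = Sparse2Corpus(csc_matrix(np.array([[1, 2, 3], [4, 5, 6]])))
    print(corpus[1])                        # [(0, 2), (1, 5)]
    print(corpus[[0, 2]].sparse.toarray())  # documents (columns) 0 and 2
    print(corpus[...].sparse.toarray())     # the whole matrix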