diff --git a/README.md b/README.md index b5a05f5..d1682b7 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ An Open Source Japanese NLP Library, based on Universal Dependencies -***Please read the [Important changes](#ginza-510) before you upgrade GiNZA.*** +***Please read the [Important changes](#ginza-520) before you upgrade GiNZA.*** [日本語ページはこちら](https://megagonlabs.github.io/ginza/) @@ -67,7 +67,7 @@ Contains information from mC4 which is made available under the ODC Attribution ``` ## Runtime Environment -This project is developed with Python>=3.6 and pip for it. +This project is developed with Python>=3.8 and pip for it. We do not recommend to use Anaconda environment because the pip install step may not work properly. Please also see the Development Environment section below. @@ -120,15 +120,15 @@ After pressing enter key, you will get the parsed results with [CoNLL-U Syntacti $ ginza 銀座でランチをご一緒しましょう。 # text = 銀座でランチをご一緒しましょう。 -1 銀座 銀座 PROPN 名詞-固有名詞-地名-一般 _ 6 obl _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ギンザ|NE=B-GPE|ENE=B-City -2 で で ADP 助詞-格助詞 _ 1 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=デ -3 ランチ ランチ NOUN 名詞-普通名詞-一般 _ 6 obj _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ランチ -4 を を ADP 助詞-格助詞 _ 3 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=ヲ -5 ご ご NOUN 接頭辞 _ 6 compound _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=CONT|Reading=ゴ -6 一緒 一緒 VERB 名詞-普通名詞-サ変可能 _ 0 root _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=ROOT|Reading=イッショ -7 し する AUX 動詞-非自立可能 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=サ行変格,連用形-一般|Reading=シ -8 ましょう ます AUX 助動詞 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=助動詞-マス,意志推量形|Reading=マショウ -9 。 。 PUNCT 補助記号-句点 _ 6 punct _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=CONT|Reading=。 +1 銀座 銀座 PROPN 名詞-固有名詞-地名-一般 _ 6 nmod _ 
SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ギンザ|NE=B-GPE|ENE=B-City|ClauseHead=6 +2 で で ADP 助詞-格助詞 _ 1 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=デ|ClauseHead=6 +3 ランチ ランチ NOUN 名詞-普通名詞-一般 _ 6 obj _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ランチ|ClauseHead=6 +4 を を ADP 助詞-格助詞 _ 3 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=ヲ|ClauseHead=6 +5 ご ご NOUN 接頭辞 _ 6 compound _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=CONT|NP_B|Reading=ゴ|ClauseHead=6 +6 一緒 一緒 NOUN 名詞-普通名詞-サ変可能 _ 0 root _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=ROOT|NP_I|Reading=イッショ|ClauseHead=6 +7 し する AUX 動詞-非自立可能 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=サ行変格,連用形-一般|Reading=シ|ClauseHead=6 +8 ましょう ます AUX 助動詞 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=助動詞-マス,意志推量形|Reading=マショウ|ClauseHead=6 +9 。 。 PUNCT 補助記号-句点 _ 6 punct _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=CONT|Reading=。|ClauseHead=6 ``` `ginzame` command provides tokenization function like [MeCab](https://taku910.github.io/mecab/). @@ -239,6 +239,13 @@ Please read the official documents to compile user dictionaries with `sudachipy` ### version 5.x +#### ginza-5.2.0 +- 2024-03-31 +- Require python>=3.8 +- Migrate to spaCy v3.7 +- New functionality + - add Japanese clause recognition API (experimental) + #### ginza-5.1.3 - 2023-09-25 - Migrate to spaCy v3.6 diff --git a/config/ja_ginza.meta.json b/config/ja_ginza.meta.json index 8266fc0..1864604 100644 --- a/config/ja_ginza.meta.json +++ b/config/ja_ginza.meta.json @@ -1,7 +1,7 @@ { "lang":"ja", "name":"ginza", - "version":"5.1.3", + "version":"5.2.0", "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019). Assigns word2vec token vectors. 
Components: tok2vec, parser, ner, morphologizer, atteribute_ruler, compound_splitter, bunsetu_recognizer.", "author":"Megagon Labs Tokyo.", "email":"ginza@megagon.ai", @@ -34,7 +34,7 @@ } ], "parent_package":"spacy", - "spacy_version":">=3.2.0,<3.7.0", + "spacy_version":">=3.4.4,<4.0.0", "spacy_git_version":"0fc3dee77", "vectors":{ "width":300, @@ -66,6 +66,6 @@ "requirements":[ "sudachipy>=0.6.2,<0.7.0", "sudachidict_core>=20210802", - "ginza>=5.1.0,<5.2.0" + "ginza>=5.2.0,<5.3.0" ] } diff --git a/config/ja_ginza_bert_large.meta.json b/config/ja_ginza_bert_large.meta.json index 4777627..d7973b8 100644 --- a/config/ja_ginza_bert_large.meta.json +++ b/config/ja_ginza_bert_large.meta.json @@ -1,7 +1,7 @@ { "lang":"ja", "name":"ginza_bert_large", - "version":"5.1.3b1", + "version":"5.2.0b1", "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + transformers-ud-japanese-electra--base. Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.", "author":"Megagon Labs Tokyo.", "email":"ginza@megagon.ai", @@ -33,7 +33,7 @@ "author":"Tohoku University" } ], - "spacy_version":">=3.6.1,<3.7.0", + "spacy_version":">=3.6.1,<4.0.0", "spacy_git_version":"458bc5f45", "pipeline":[ "transformer", @@ -65,10 +65,9 @@ "requirements":[ "sudachipy>=0.6.7,<0.7.0", "sudachidict_core>=20230711", - "spacy>=3.6.1,<3.7.0", "spacy-transformers>=1.2.5,<1.3.0", "fugashi>=1.3.0", "unidic-lite>=1.0.8", - "ginza>=5.1.3,<5.2.0" + "ginza>=5.2.0,<5.3.0" ] } \ No newline at end of file diff --git a/config/ja_ginza_electra.meta.json b/config/ja_ginza_electra.meta.json index c800715..adcd30d 100644 --- a/config/ja_ginza_electra.meta.json +++ b/config/ja_ginza_electra.meta.json @@ -1,7 +1,7 @@ { "lang":"ja", "name":"ginza_electra", - "version":"5.1.3", + "version":"5.2.0", "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + transformers-ud-japanese-electra--base. 
Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.", "author":"Megagon Labs Tokyo.", "email":"ginza@megagon.ai", @@ -41,7 +41,7 @@ } ], "parent_package":"spacy", - "spacy_version":">=3.2.0,<3.7.0", + "spacy_version":">=3.4.4,<4.0.0", "spacy_git_version":"0fc3dee77", "pipeline":[ "transformer", @@ -75,7 +75,6 @@ "sudachidict_core>=20210802", "sudachitra>=0.1.6,<0.2.0", "ginza-transformers>=0.4.0,<0.5.0", - "ginza>=5.1.0,<5.2.0", - "spacy-transformers>=1.1.2,<1.2.0" + "ginza>=5.2.0,<5.3.0" ] } diff --git a/docs/bunsetu_api.md b/docs/bunsetu_api.md index 39fd699..4d4189e 100644 --- a/docs/bunsetu_api.md +++ b/docs/bunsetu_api.md @@ -41,7 +41,7 @@ for frame, count in sorted(frames.items(), key=lambda t: -t[1]): print(count, *frame, sep="\t") # 出現頻度の高い順に表示 ``` -#### 表1 GiNZA v4で追加された文節APIの一覧 +#### 表1 GiNZAの文節APIの一覧 | category | func or variable | description | | --- | --- | --- | @@ -72,6 +72,10 @@ for frame, count in sorted(frames.items(), key=lambda t: -t[1]): | Subtoken | | | | | sub_tokens() | トークンの分割情報。 | | | set_split_mode() | デフォルトの分割モードの変更。 | +| Clause | | | +| | clauses() | 節単位に分割されたトークン列。(experimental) | +| | clause_head() | トークンが属する節のヘッドとなるトークン。(experimental) | +| | clause_head_i() | トークンが属する節のヘッドとなるトークン番号。(experimental) | ## 解説資料 diff --git a/docs/command_line_tool.md b/docs/command_line_tool.md index b018eaf..4d7fcc6 100644 --- a/docs/command_line_tool.md +++ b/docs/command_line_tool.md @@ -7,15 +7,15 @@ $ ginza 銀座でランチをご一緒しましょう。 # text = 銀座でランチをご一緒しましょう。 -1 銀座 銀座 PROPN 名詞-固有名詞-地名-一般 _ 6 obl _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ギンザ|NE=B-GPE|ENE=B-City -2 で で ADP 助詞-格助詞 _ 1 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=デ -3 ランチ ランチ NOUN 名詞-普通名詞-一般 _ 6 obj _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ランチ -4 を を ADP 助詞-格助詞 _ 3 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=ヲ -5 ご 
ご NOUN 接頭辞 _ 6 compound _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=CONT|Reading=ゴ -6 一緒 一緒 VERB 名詞-普通名詞-サ変可能 _ 0 root _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=ROOT|Reading=イッショ -7 し する AUX 動詞-非自立可能 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=サ行変格,連用形-一般|Reading=シ -8 ましょう ます AUX 助動詞 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=助動詞-マス,意志推量形|Reading=マショウ -9 。 。 PUNCT 補助記号-句点 _ 6 punct _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=CONT|Reading=。 +1 銀座 銀座 PROPN 名詞-固有名詞-地名-一般 _ 6 nmod _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ギンザ|NE=B-GPE|ENE=B-City|ClauseHead=6 +2 で で ADP 助詞-格助詞 _ 1 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=デ|ClauseHead=6 +3 ランチ ランチ NOUN 名詞-普通名詞-一般 _ 6 obj _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ランチ|ClauseHead=6 +4 を を ADP 助詞-格助詞 _ 3 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=ヲ|ClauseHead=6 +5 ご ご NOUN 接頭辞 _ 6 compound _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=CONT|NP_B|Reading=ゴ|ClauseHead=6 +6 一緒 一緒 NOUN 名詞-普通名詞-サ変可能 _ 0 root _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=ROOT|NP_I|Reading=イッショ|ClauseHead=6 +7 し する AUX 動詞-非自立可能 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=サ行変格,連用形-一般|Reading=シ|ClauseHead=6 +8 ましょう ます AUX 助動詞 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=助動詞-マス,意志推量形|Reading=マショウ|ClauseHead=6 +9 。 。 PUNCT 補助記号-句点 _ 6 punct _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=CONT|Reading=。|ClauseHead=6 ``` diff --git a/docs/index.md b/docs/index.md index dbb42c5..06cdf33 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,16 +8,12 @@ ## What's new! 
-- [`ja_ginza_bert_large`のβ版を公開中](https://github.com/megagonlabs/ginza/releases/tag/v5.1.3) +- `GiNZA v5.2.0`をリリースしました (2024.03.31) + - 日本語の節認定のためのAPIを追加 (experimental) +- [`ja_ginza_bert_large`のβ版を公開中](https://github.com/megagonlabs/ginza/releases/tag/v5.2.0) - [cl-tohoku/bert-large-japanese-v2](https://huggingface.co/cl-tohoku/bert-large-japanese-v2)をベースモデルに採用 - 精度が大幅に向上(LAS=0.938, UAS=0.949, UPOS=0.983, ENE=0.708) - CUDAに対応し8GB以上のRAMを搭載したGPU環境、または、M1・M2などApple Silicon環境の利用を推奨 -- `GiNZA v5.1.3` - - `spaCy v3.2 ~ v3.6`に対応 -- `GiNZA v5.1` - - `spaCy v3.2`と`Sudachi.rs(SudachiPy v0.6.2)`に対応 - - バッチ解析処理をGPU環境で50〜60%・CPU環境で10〜40%高速化 - - ginzaコマンドの並列実行オプション(`ginza -p {n_process}`および`ginzame`)の処理効率を向上 - ginzaコマンドで日本語以外を含む全てのspaCyモデルが利用可能に - `ginza -m en_core_web_md` の形でモデル名を指定することで[CoNLL-U](https://universaldependencies.org/format.html#syntactic-annotation)出力ツールとして利用可能 - [ginzaコマンドの解説ページ](https://megagonlabs.github.io/ginza/command_line_tool.html)の記述を拡充 @@ -25,6 +21,11 @@ ***GiNZAをアップグレードする際は下記の互換性情報を確認してください。*** +## GiNZA v5.2 互換性情報 +- Pythonの対応バージョンが3.8以上に変更されました +- spaCyの対応バージョンがv3.4.4以上(v4.0.0未満)に変更されました +- コマンドラインのconllu出力のmisc列にClauseHeadフィールドが追加されました + ## GiNZA v5.1 互換性情報 - `ginza --require_gpu`および`ginza -g`オプションが引数にgpu_idを取るようになりました - gpu_idに-1を指定(デフォルト)するとCPUのみを使用します @@ -43,6 +44,24 @@ ## GiNZA v5 新機能 +### 日本語の節認定API (experimental) + +GiNZA v5.2.0で日本語の節認定機能(試用版)を実装しました。 + +`ginza`コマンドの実行結果のconllu出力のmisc列を拡張して、各トークンが属する節のヘッドのトークン番号を`ClauseHead`フィールドで示しています。 + +APIには次の関数を追加しました。 +- `clauses(doc)` + - 節単位に分割されたトークン列の取得 +- `clause_head(token)` + - トークンが属する節のヘッドとなるトークンの取得 +- `clause_head_i(token)` + - トークンが属する節のヘッドとなるトークン番号の取得 + +現在の節認定の実装は次のような簡易なもので、今後さらに改良を行う予定です。 +- 文に含まれる読点を節区切りの候補とする +- さらに読点で区切られた節が2文節以上で構成される場合のみ節として認定する + ### Transformersモデルによる解析精度の向上 GiNZA v5の解析精度は以前のバージョンから飛躍的な向上を遂げました。精度向上の主たる貢献はTransformers事前学習モデルの導入にあります。次の図は、UD_Japanese-BCCWJ r2.8における、従来型モデルの`ja_ginza`と、Transformers事前学習モデルを用いた`ja_ginza_electra`の、依存関係ラベリングおよび単語依存構造解析の学習曲線です。 @@ -69,7 +88,7 @@ GiNZA 
v5の解析精度は以前のバージョンから飛躍的な向上を遂 ## 実行環境 -GiNZAは Python 3.6以上(および対応するpip)で動作検証を行っています。 +GiNZAは Python 3.8以上(および対応するpip)で動作検証を行っています。 GiNZAをインストールする前に予めPython実行環境を構築してください。 ### 実行環境のセットアップ @@ -137,15 +156,15 @@ $ pip install torch thinc-apple-ops $ ginza 銀座でランチをご一緒しましょう。 # text = 銀座でランチをご一緒しましょう。 -1 銀座 銀座 PROPN 名詞-固有名詞-地名-一般 _ 6 obl _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ギンザ|NE=B-GPE|ENE=B-City -2 で で ADP 助詞-格助詞 _ 1 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=デ -3 ランチ ランチ NOUN 名詞-普通名詞-一般 _ 6 obj _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ランチ -4 を を ADP 助詞-格助詞 _ 3 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=ヲ -5 ご ご NOUN 接頭辞 _ 6 compound _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=CONT|Reading=ゴ -6 一緒 一緒 VERB 名詞-普通名詞-サ変可能 _ 0 root _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=ROOT|Reading=イッショ -7 し する AUX 動詞-非自立可能 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=サ行変格,連用形-一般|Reading=シ -8 ましょう ます AUX 助動詞 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=助動詞-マス,意志推量形|Reading=マショウ -9 。 。 PUNCT 補助記号-句点 _ 6 punct _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=CONT|Reading=。 +1 銀座 銀座 PROPN 名詞-固有名詞-地名-一般 _ 6 nmod _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ギンザ|NE=B-GPE|ENE=B-City|ClauseHead=6 +2 で で ADP 助詞-格助詞 _ 1 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=デ|ClauseHead=6 +3 ランチ ランチ NOUN 名詞-普通名詞-一般 _ 6 obj _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=ランチ|ClauseHead=6 +4 を を ADP 助詞-格助詞 _ 3 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=ヲ|ClauseHead=6 +5 ご ご NOUN 接頭辞 _ 6 compound _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=CONT|NP_B|Reading=ゴ|ClauseHead=6 +6 一緒 一緒 NOUN 名詞-普通名詞-サ変可能 _ 0 root _ 
SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=ROOT|NP_I|Reading=イッショ|ClauseHead=6 +7 し する AUX 動詞-非自立可能 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=サ行変格,連用形-一般|Reading=シ|ClauseHead=6 +8 ましょう ます AUX 助動詞 _ 6 aux _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=助動詞-マス,意志推量形|Reading=マショウ|ClauseHead=6 +9 。 。 PUNCT 補助記号-句点 _ 6 punct _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=CONT|Reading=。|ClauseHead=6 ``` 実行環境に`ja_ginza_electra`と`ja_ginza`の両方のモデルがインストールされている場合、`ginza`コマンドは`ja_ginza_electra`を優先して使用します。同じ状況で`ja_ginza`を使用するには`ginza -m`オプションでモデル名を指定します。 @@ -273,6 +292,13 @@ Contains information from mC4 which is made available under the ODC Attribution ### version 5.x +#### ginza-5.2.0 +- 2024-03-31 +- Require python>=3.8 +- Migrate to spaCy v3.7 +- New functionality + - add Japanese clause recognition API (experimental) + #### ginza-5.1.3 - 2023-09-25 - Migrate to spaCy v3.6 diff --git a/ginza/__init__.py b/ginza/__init__.py index 3738242..4e415f3 100644 --- a/ginza/__init__.py +++ b/ginza/__init__.py @@ -24,6 +24,7 @@ "ent_label_ene", "ent_label_ontonotes", "reading_form", "inflection", "bunsetu_bi_label", "bunsetu_position_type", "is_bunsetu_head", + "clauses", "clause_head", "SEP", "default_join_func", "traverse", "head", "ancestors", "conjuncts", "children", "lefts", "rights", "subtree", @@ -38,6 +39,9 @@ "bunsetu_head_tokens", "bunsetu_bi_labels", "bunsetu_position_types", + "clauses", + "clause_head", + "clause_head_i", "BunsetuRecognizer", # from compound_splitter "CompoundSplitter", diff --git a/ginza/analyzer.py b/ginza/analyzer.py index f25931e..4219159 100644 --- a/ginza/analyzer.py +++ b/ginza/analyzer.py @@ -9,7 +9,7 @@ from spacy.language import Language from spacy.lang.ja import Japanese -from . import set_split_mode, inflection, reading_form, ent_label_ene, ent_label_ontonotes, bunsetu_bi_label, bunsetu_position_type +from . 
import set_split_mode, inflection, reading_form, ent_label_ene, ent_label_ontonotes, bunsetu_bi_label, bunsetu_position_type, clause_head_i from .bunsetu_recognizer import bunsetu_available, bunsetu_head_list, bunsetu_phrase_span @@ -77,7 +77,10 @@ def set_nlp(self) -> None: try: nlp = spacy.load("ja_ginza") except IOError as e: - raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza` or `pip install ja-ginza-electra`.') + try: + nlp = spacy.load("ja_ginza_bert_large") + except IOError as e: + raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza` or `pip install ja-ginza-electra`.') if self.disable_sentencizer: nlp.add_pipe("disable_sentencizer", before="parser") @@ -210,18 +213,20 @@ def conllu_token_line(sent, token, np_label, use_bunsetu, use_normalized_form, u reading = reading_form(token, use_orth_if_reading_is_none) ne = ent_label_ontonotes(token) ene = ent_label_ene(token) + clause_head = clause_head_i(token) + 1 misc = "|".join( filter( lambda s: s, ( "SpaceAfter=Yes" if token.whitespace_ else "SpaceAfter=No", - "" if not bunsetu_bi else "BunsetuBILabel={}".format(bunsetu_bi), - "" if not position_type else "BunsetuPositionType={}".format(position_type), + "" if not bunsetu_bi else f"BunsetuBILabel={bunsetu_bi}", + "" if not position_type else f"BunsetuPositionType={position_type}", np_label, - "" if not inf else "Inf={}".format(inf), + "" if not inf else f"Inf={inf}", "" if not reading else "Reading={}".format(reading.replace("|", "\\|").replace("\\", "\\\\")), - "" if not ne or ne == "O" else "NE={}".format(ne), - "" if not ene or ene == "O" else "ENE={}".format(ene), + "" if not ne or ne == "O" else f"NE={ne}", + "" if not ene or ene == "O" else f"ENE={ene}", + "" if not clause_head else f"ClauseHead={clause_head}", ) ) ) diff --git a/ginza/bunsetu_recognizer.py b/ginza/bunsetu_recognizer.py index f28405f..2863340 100644 --- 
a/ginza/bunsetu_recognizer.py +++ b/ginza/bunsetu_recognizer.py @@ -1,4 +1,5 @@ -from typing import Iterable, List +import re +from typing import Dict, Iterable, List, Optional, Set from spacy.language import Language from spacy.tokens import Doc, Span, Token @@ -18,6 +19,11 @@ "BUNSETU_HEAD_SUFFIX", "PHRASE_RELATIONS", "POS_PHRASE_MAP", + "clauses", + "clause_head", + "clause_head_i", + "CLAUSE_MARKER_RULES", + "MIN_BUNSETU_NUM_IN_CLAUSE", ] @@ -40,6 +46,14 @@ "CCONJ": "CCONJP", } +CLAUSE_MARKER_RULES = [ + { + "tag_": "補助記号-読点", + }, +] + +MIN_BUNSETU_NUM_IN_CLAUSE = 2 + def bunsetu_available(span: Span): return "bunsetu_heads" in span.doc.user_data @@ -139,10 +153,32 @@ def bunsetu_position_types(span: Span) -> List[str]: return position_types[start:end] +def clauses(doc: Doc) -> List[List[Token]]: + clauses = doc.user_data["clauses"] + return [[doc[token] for token in tokens] for tokens in clauses.values()] + + +def clause_head(token: Token) -> Token: + return token.doc[token.doc.user_data["clause_heads"][token.i]] + + +def clause_head_i(token: Token) -> int: + doc = token.doc + return doc.user_data["clause_heads"][token.i] - token.sent.start + + class BunsetuRecognizer: - def __init__(self, nlp: Language, remain_bunsetu_suffix: bool = False) -> None: + def __init__( + self, + nlp: Language, + remain_bunsetu_suffix: bool = False, + clause_marker_rules: List[Dict[str, str]] = CLAUSE_MARKER_RULES, + min_bunsetu_num_in_clause: int = MIN_BUNSETU_NUM_IN_CLAUSE, + ) -> None: self.nlp = nlp self._remain_bunsetu_suffix = remain_bunsetu_suffix + self._clause_marker_rules = [{k: re.compile(v) for k, v in rule.items()} for rule in clause_marker_rules] + self._min_bunsetu_num_in_clause = min_bunsetu_num_in_clause @property def remain_bunsetu_suffix(self) -> str: @@ -152,6 +188,22 @@ def remain_bunsetu_suffix(self) -> str: def remain_bunsetu_suffix(self, remain: bool): self._remain_bunsetu_suffix = remain + @property + def clause_marker_rules(self) -> List[Dict[str, str]]: + 
return [{k: v.pattern for k, v in rules.items()} for rules in self._clause_marker_rules] + + @clause_marker_rules.setter + def clause_marker_rules(self, _clause_marker_rules: List[Dict[str, str]]): + self._clause_marker_rules = [{k: re.compile(v) for k, v in rules.items()} for rules in _clause_marker_rules] + + @property + def min_bunsetu_num_in_clause(self) -> int: + return self._min_bunsetu_num_in_clause + + @min_bunsetu_num_in_clause.setter + def min_bunsetu_num_in_clause(self, _min_bunsetu_num_in_clause: int): + self._min_bunsetu_num_in_clause = _min_bunsetu_num_in_clause + def __call__(self, doc: Doc) -> Doc: debug = False heads = [False] * len(doc) @@ -244,6 +296,69 @@ def __call__(self, doc: Doc) -> Doc: else: position_types[t.i] = "CONT" doc.user_data["bunsetu_position_types"] = position_types + + bunsetu_heads_set = set(bunsetu_heads) + clause_head_candidates = set() + roots = set() + for t in doc: + if t.dep_.lower() == "root": + roots.add(t.i) + continue + for rule in self._clause_marker_rules: + for attr, pattern in rule.items(): + if not pattern.fullmatch(getattr(t, attr)): + break + else: + if t.i in bunsetu_heads_set: + clause_head_candidates.add(t.i) + else: + for ancestor in t.ancestors: + if ancestor.i in bunsetu_heads_set: + clause_head_candidates.add(t.head.i) + break + break + clause_head_candidates -= roots + + for clause_head in list(sorted(clause_head_candidates)): + subtree = set(_.i for _ in doc[clause_head].subtree) + if len(subtree & bunsetu_heads_set) < self._min_bunsetu_num_in_clause: + clause_head_candidates.remove(clause_head) + + clause_head_candidates |= roots + for clause_head in list(sorted(clause_head_candidates)): + subtree = set(_.i for _ in doc[clause_head].subtree) + subtree_bunsetu = subtree & bunsetu_heads_set + descendant_clauses = subtree & clause_head_candidates - {clause_head} + for subclause in descendant_clauses: + subtree_bunsetu -= set(_.i for _ in doc[subclause].subtree) + if len(subtree_bunsetu) < 
self._min_bunsetu_num_in_clause: + if clause_head in roots: + clause_head_candidates -= descendant_clauses + else: + clause_head_candidates.remove(clause_head) + + clause_heads = list(sorted(clause_head_candidates)) + + def _children_except_clause_heads(idx): + children = [] + for t in doc[idx].lefts: + if t.i in clause_heads: + continue + children += _children_except_clause_heads(t.i) + children.append(idx) + for t in doc[idx].rights: + if t.i in clause_heads: + continue + children += _children_except_clause_heads(t.i) + return children + + clauses = {head: _children_except_clause_heads(head) for head in clause_heads} + doc.user_data["clauses"] = clauses + clause_heads = [-1] * len(doc) + for head, tokens in clauses.items(): + for token in tokens: + clause_heads[token] = head + doc.user_data["clause_heads"] = clause_heads return doc diff --git a/requirements.txt b/requirements.txt index 9e8a3d9..f5c3af3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -spacy>=3.2.0,<3.7.0 +spacy>=3.4.4,<4.0.0 plac>=1.3.3 SudachiPy>=0.6.2,<0.7.0 SudachiDict-core>=20210802 diff --git a/setup.py b/setup.py index d8caa3d..4247db2 100644 --- a/setup.py +++ b/setup.py @@ -16,9 +16,9 @@ "ginzame = ginza.command_line:main_ginzame", ], }, - python_requires=">=3.6", + python_requires=">=3.8", install_requires=[ - "spacy>=3.2.0,<3.7.0", + "spacy>=3.4.4,<4.0.0", "plac>=1.3.3", "SudachiPy>=0.6.2,<0.7.0", "SudachiDict-core>=20210802", @@ -29,5 +29,5 @@ name="ginza", packages=find_packages(include=["ginza"]), url="https://github.com/megagonlabs/ginza", - version='5.1.3', + version='5.2.0', )