From 28ab09b2f301f9e5425ca1d007ec923642db50bf Mon Sep 17 00:00:00 2001 From: Anne Schumacher Date: Mon, 11 Nov 2024 16:25:59 +0100 Subject: [PATCH] more analysis metadata --- sparv/modules/geo/metadata.yaml | 118 +++++++++++++ sparv/modules/hunpos/metadata.yaml | 211 ++++++++++++++++++++++++ sparv/modules/readability/metadata.yaml | 4 +- 3 files changed, 331 insertions(+), 2 deletions(-) create mode 100644 sparv/modules/geo/metadata.yaml create mode 100644 sparv/modules/hunpos/metadata.yaml diff --git a/sparv/modules/geo/metadata.yaml b/sparv/modules/geo/metadata.yaml new file mode 100644 index 00000000..afdfd8d2 --- /dev/null +++ b/sparv/modules/geo/metadata.yaml @@ -0,0 +1,118 @@ +id: geo-parent +abstract: true +task: geotagging +language_codes: + - swe +keywords: + - geotagging +standard_reference: '' +other_references: [] +tool: '' +model: "[GeoNames](https://www.geonames.org/)" +trained_on: '' +tagset: '' +evaluation_results: '' +created: 2018-05-28 +updated: 2022-05-18 +--- +id: swe-geotagcontext-sparv +parent: geo-parent +name: + swe: Geotaggning av platsnamn från kontext + eng: Geotagging of place names from context +short_description: + swe: Annotering av texter med platsinformation, baserad på platser som finns i texten + eng: Annotate text chunks with location data, based on locations contained within the text +annotations: + - :geo.geo_context + - :geo.geo_context + - :geo.geo_context +example_output: |- + ```xml + + + + Varje + tisdag + kommer + en + leverans + av + lådor + med + matsvinn + från + Ica + Maxi + i + Torslanda + till + förskolan + i + Göteborg + . + + + + ``` +description: + swe: |- + Texter berikas med platsnamn (och deras geografiska koordinater) som finns i dem. Detta är baserat på platsnamn som + hittats genom namnigenkänning med [SweNer](https://spraakbanken.gu.se/analyser/swe-namedentity-swener). Geografiska + koordinater letas upp i [GeoNames-databasen](https://www.geonames.org/). 
Denna annotation kan användas på valfria + textspann såsom text, stycke, mening eller token. + eng: |- + Text chunks are enriched with place names (and their geographic coordinates) occurring within them. This is based on + the place names found by the named entity tagger + [SweNer](https://spraakbanken.gu.se/en/analyses/swe-namedentity-swener). Geographical coordinates are looked up in + the [GeoNames database](https://www.geonames.org/). This annotation can be applied to any text chunk, e.g. texts, + paragraphs, sentences or tokens. +--- +id: swe-geotagmetadata-sparv +parent: geo-parent +name: + swe: Geotaggning av platsnamn från metadata + eng: Geotagging of place names from metadata +short_description: + swe: Annotering av texter med platsinformation, baserad på metadata som innehåller platsnamn + eng: Annotate text chunks with location data, based on metadata containing location names +annotations: + - :geo.geo_metadata +example_output: |- + ```xml + + Det + var + + änna + bösigt + i + bamban + ! + + ``` +example_extra: |- + In order to use this annotation you need to tell Sparv where to look for the geographic metadata. If, for example, + your corpus looks like this: + ```xml + Det var då änna bösigt i bamban! + ``` + and you would like to use `author_location` as input for your annotation you need to add the following setting to your + Sparv corpus configuration file: + ```yaml + geo: + metadata_source: text:author_location + ``` +description: + swe: |- + Texter berikas med platsnamn (och deras geografiska koordinater) som finns i dess metadata. Detta är baserat på + platsnamn som hittats genom namnigenkänning med + [SweNer](https://spraakbanken.gu.se/analyser/swe-namedentity-swener). Geografiska koordinater letas upp i + [GeoNames-databasen](https://www.geonames.org/). Denna annotation kan användas på valfria textspann och valfria + attribut som innehåller platsnamn. 
+ eng: |- + Text chunks are enriched with place names (and their geographic coordinates) occurring in their metadata. This is based on + the place names found by the named entity tagger + [SweNer](https://spraakbanken.gu.se/en/analyses/swe-namedentity-swener). Geographical coordinates are looked up in + the [GeoNames database](https://www.geonames.org/). This annotation can be applied to any text chunk and any + attribute containing place names. diff --git a/sparv/modules/hunpos/metadata.yaml b/sparv/modules/hunpos/metadata.yaml new file mode 100644 index 00000000..9c665e8a --- /dev/null +++ b/sparv/modules/hunpos/metadata.yaml @@ -0,0 +1,211 @@ +id: hunpos-parent +abstract: true +language_codes: + - swe +standard_reference: '' +other_references: + - "Hunpos: https://code.google.com/archive/p/hunpos/" +tool: "Hunpos" +trained_on: "[SUC3](https://spraakbanken.gu.se/resurser/suc3)" +tagset: "[SUC3](https://spraakbanken.gu.se/korp/markup/msdtags.html)" +evaluation_results: '' +--- +id: swe-pos-hunpos-suc3 +parent: hunpos-parent +name: + swe: SUC-ordklasstaggning med Hunpos + eng: SUC part-of-speech tagging with Hunpos +short_description: + swe: Annotering av SUC-ordklasser med Hunpos för svenska + eng: Swedish part-of-speech annotation with SUC tags by Hunpos +task: part-of-speech tagging +keywords: + - pos-tagging +annotations: + - :hunpos.pos +example_output: |- + ```xml + Det + här + är + en + korpus + . + ``` +model: "[suc3_suc-tags_default-setting_utf8.model](https://github.com/spraakbanken/sparv-models/blob/master/hunpos/suc3_suc-tags_default-setting_utf8.model?raw=true)" +description: + swe: |- + Meningssegment analyseras och annoteras med ordklasstaggar. Ingår inte längre i + Sparvs standardanalyser eftersom Stanzas ordklassannotering ger bättre resultat. + eng: |- + Sentence segments are analysed to enrich tokens with part-of-speech tags. No longer + used by default by Sparv because Stanza's POS-tagging yields better results. 
+created: 2010-12-15 +updated: 2018-05-28 +--- +id: swe-msd-hunpos-suc3 +parent: hunpos-parent +name: + swe: Morfosyntaktisk SUC-taggning med Hunpos + eng: Tagging of morphological features (SUC) by Hunpos +short_description: + swe: Annotering av morfosyntaktiska deskriptorer (SUC) med Hunpos för svenska + eng: Annotation of morphological features (SUC) by Hunpos for Swedish +task: morphosyntactic tagging +keywords: + - msd +annotations: + - :hunpos.msd +example_output: |- + ```xml + Det + här + är + en + korpus + . + ``` +model: "[suc3_suc-tags_default-setting_utf8.model](https://github.com/spraakbanken/sparv-models/blob/master/hunpos/suc3_suc-tags_default-setting_utf8.model?raw=true)" +description: + swe: |- + Meningssegment analyseras och annoteras med ordklasstaggar och morfosyntaktisk information. Ingår inte längre i + Sparvs standardanalyser eftersom Stanzas ordklassannotering ger bättre resultat. + eng: |- + Sentence segments are analysed to enrich tokens with part-of-speech tags and morphosyntactic information. No longer + used by default by Sparv because Stanza's POS-tagging yields better results. +created: 2010-12-15 +updated: 2018-05-28 +--- +id: swe-pos-hunpos-suc3-1800 +parent: hunpos-parent +name: + swe: SUC-ordklasstaggning med Hunpos för 1800-talssvenska + eng: SUC part-of-speech tagging with Hunpos for Swedish from the 1800's +short_description: + swe: Annotering av SUC-ordklasser med Hunpos för 1800-talssvenska + eng: Part-of-speech annotation with SUC tags by Hunpos for Swedish from the 1800's +task: part-of-speech tagging +keywords: + - pos-tagging +annotations: + - :hunpos.pos +example_output: |- + ```xml + Lådan + var + upphängd + under + den + waggon + hvari + de + andra + djuren + befunno + sig + . 
+ ``` +model: |- + - [suc3_suc-tags_default-setting_utf8.model](https://github.com/spraakbanken/sparv-models/blob/master/hunpos/suc3_suc-tags_default-setting_utf8.model?raw=true) + - a word list along with the words' morphosyntactic information generated from the [Dalin + morphology](https://spraakbanken.gu.se/resurser/dalinm) and the [Swedberg + morphology](https://spraakbanken.gu.se/resurser/swedbergm) +description: + swe: |- + Meningssegment analyseras och annoteras med ordklasstaggar. Utöver ordklasstaggningsmodellen använder Hunpos listor + med böjningsformer för att kunna generera bättre ordklasstaggar för 1800-talssvenska. + eng: |- + Sentence segments are analysed to enrich tokens with part-of-speech tags. In addition to the pos model inflection + lists are provided to Hunpos to make more accurate part-of-speech predictions for Swedish from the 1800's. +created: 2012-10-23 +updated: 2015-09-11 +--- +id: swe-pos-hunpos-suc3-1800 +parent: hunpos-parent +name: + swe: SUC-ordklasstaggning med Hunpos för 1800-talssvenska + eng: SUC part-of-speech tagging with Hunpos for Swedish from the 1800's +short_description: + swe: Annotering av SUC-ordklasser med Hunpos för 1800-talssvenska + eng: Part-of-speech annotation with SUC tags by Hunpos for Swedish from the 1800's +task: part-of-speech tagging +keywords: + - pos-tagging +annotations: + - :hunpos.pos +example_output: |- + ```xml + Lådan + var + upphängd + under + den + waggon + hvari + de + andra + djuren + befunno + sig + . + ``` +model: |- + - [suc3_suc-tags_default-setting_utf8.model](https://github.com/spraakbanken/sparv-models/blob/master/hunpos/suc3_suc-tags_default-setting_utf8.model?raw=true) + - a word list along with the words' morphosyntactic information generated from the [Dalin + morphology](https://spraakbanken.gu.se/resurser/dalinm) and the [Swedberg + morphology](https://spraakbanken.gu.se/resurser/swedbergm) +description: + swe: |- + Meningssegment analyseras och annoteras med ordklasstaggar. 
Utöver ordklasstaggningsmodellen använder Hunpos listor + med böjningsformer för att kunna generera bättre ordklasstaggar för 1800-talssvenska. + eng: |- + Sentence segments are analysed to enrich tokens with part-of-speech tags. In addition to the pos model inflection + lists are provided to Hunpos to make more accurate part-of-speech predictions for Swedish from the 1800's. +created: 2012-10-23 +updated: 2015-09-11 +--- +id: swe-msd-hunpos-suc3-1800 +parent: hunpos-parent +name: + swe: Morfosyntaktisk SUC-taggning med Hunpos för 1800-talssvenska + eng: Tagging of morphological features (SUC) by Hunpos for Swedish from the 1800's +short_description: + swe: Annotering av morfosyntaktiska deskriptorer (SUC) med Hunpos för 1800-talssvenska + eng: Annotation of morphological features (SUC) by Hunpos for Swedish from the 1800's +task: morphosyntactic tagging +keywords: + - msd +annotations: + - :hunpos.msd +example_output: |- + ```xml + Lådan + var + upphängd + under + den + waggon + hvari + de + andra + djuren + befunno + sig + . + ``` +model: |- + - [suc3_suc-tags_default-setting_utf8.model](https://github.com/spraakbanken/sparv-models/blob/master/hunpos/suc3_suc-tags_default-setting_utf8.model?raw=true) + - a word list along with the words' morphosyntactic information generated from the [Dalin + morphology](https://spraakbanken.gu.se/resurser/dalinm) and the [Swedberg + morphology](https://spraakbanken.gu.se/resurser/swedbergm) +description: + swe: |- + Meningssegment analyseras och annoteras med ordklasstaggar och morfosyntaktisk information. Utöver + ordklasstaggningsmodellen använder Hunpos listor med böjningsformer för att kunna generera bättre ordklasstaggar för + 1800-talssvenska. + eng: |- + Sentence segments are analysed to enrich tokens with part-of-speech tags and morphosyntactic information. In + addition to the pos model inflection lists are provided to Hunpos to make more accurate part-of-speech predictions + for Swedish from the 1800's. 
+created: 2012-10-23 +updated: 2015-09-11 diff --git a/sparv/modules/readability/metadata.yaml b/sparv/modules/readability/metadata.yaml index c63386c7..e9d6adf2 100644 --- a/sparv/modules/readability/metadata.yaml +++ b/sparv/modules/readability/metadata.yaml @@ -5,7 +5,7 @@ language_codes: - swe keywords: - readability measures -other_references: '' +other_references: [] tool: '' model: '' trained_on: '' @@ -138,7 +138,7 @@ description: dividing this by the number of verbs, adverbs and pronouns. A high nominal ratio suggests a high density of information, which can also mean that the text is difficult to read. --- -id: swe-readability-sparv-nk +id: swe-readability-sparv-ovix parent: readability-parent name: swe: Annotering av Ordvariationsindex (OVIX) för texter