# project.yml — forked from explosion/projects
# spaCy project definition: trains and evaluates a named entity recognizer on
# CoNLL-2003. Declares the variables, directories, assets, workflows and
# commands consumed by the `spacy project` CLI.
title: "Named Entity Recognition (CoNLL-2003)"

# Values substituted into the commands below via ${vars.<name>}.
vars:
  # Base name of the training config in configs/ (without the .cfg suffix).
  config: "cnn_glove_small"  # pick between "transformer" or "cnn_glove_small"
  # Passed to --gpu-id; in spaCy's CLI, -1 selects the CPU.
  gpu: -1

# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories: ["assets", "training", "configs", "metrics", "corpus"]

# Files the project depends on. Entries without a `url` cannot be downloaded
# and have to be placed at `dest` manually.
assets:
  - url: "https://nlp.stanford.edu/data/glove.840B.300d.zip"
    dest: "assets/vectors.zip"
    description: "GloVe vectors"
  - dest: "assets/conll2003/dev.iob"
    description: "Development data (not available publicly so you have to add the file yourself)"
  - dest: "assets/conll2003/test.iob"
    description: "Test data (not available publicly so you have to add the file yourself)"
  - dest: "assets/conll2003/train.iob"
    description: "Training data (not available publicly so you have to add the file yourself)"
  # Use the raw file URL: the github.com/.../blob/... address serves an HTML
  # page, not the JSON document itself.
  - url: "https://raw.githubusercontent.com/explosion/spacy-lookups-data/master/spacy_lookups_data/data/en_orth_variants.json"
    dest: "assets/orth_variants.json"
    description: "A file containing orth variants for data augmentation"

# Named sequences of commands, executed in the order listed.
workflows:
  all:
    - vectors
    - corpus
    - train
    - evaluate

# Individual steps. `deps` and `outputs` let the project CLI decide whether a
# step has to be re-run, so they should list every file a step reads/writes.
commands:
  - name: corpus
    help: "Convert the data to spaCy's format"
    # Converts every .iob file in assets/conll2003 into a .spacy file of the
    # same base name inside corpus/.
    script:
      - "python -m spacy convert assets/conll2003 corpus/"
    deps:
      - "assets/conll2003/dev.iob"
      - "assets/conll2003/test.iob"
      - "assets/conll2003/train.iob"
    outputs:
      - "corpus/train.spacy"
      - "corpus/dev.spacy"
      - "corpus/test.spacy"

  - name: vectors
    help: "Convert, truncate and prune the vectors."
    script:
      - "python -m spacy init vectors en assets/vectors.zip corpus/en_vectors -n en_glove840b_vectors_md"
    deps:
      - "assets/vectors.zip"
    outputs:
      - "corpus/en_vectors"

  - name: train
    help: "Train the full pipeline"
    script:
      - "python -m spacy train configs/${vars.config}.cfg -o training/${vars.config} --gpu-id ${vars.gpu}"
    deps:
      - "configs/${vars.config}.cfg"
      - "corpus/en_vectors"
      # Listed so that re-converting the corpus invalidates a cached training
      # run. NOTE(review): assumes the configs read these two paths — confirm
      # against configs/*.cfg.
      - "corpus/train.spacy"
      - "corpus/dev.spacy"
    outputs:
      - "training/${vars.config}/model-best"

  - name: evaluate
    help: "Evaluate on the test data and save the metrics"
    script:
      - "python -m spacy evaluate ./training/${vars.config}/model-best ./corpus/test.spacy --output ./metrics/${vars.config}.json --gpu-id ${vars.gpu} --gold-preproc"
    deps:
      - "training/${vars.config}/model-best"
      - "corpus/test.spacy"
    outputs:
      - "metrics/${vars.config}.json"

  - name: clean
    help: "Remove intermediate files"
    # Empties the generated directories; assets/ and configs/ are left alone.
    script:
      - "rm -rf training/*"
      - "rm -rf metrics/*"
      - "rm -rf corpus/*"