-
Notifications
You must be signed in to change notification settings - Fork 2
/
refresh_corpus_data.sh
executable file
·45 lines (42 loc) · 1.39 KB
/
refresh_corpus_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/bin/bash
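# Usage: refresh_corpus_data.sh [--pull] UD_REPO_DIR...
# Receives a list of directories of UD repos whose corpus metadata should be regenerated.
# If the first argument is --pull, 'git pull' is run in each repo after a branch is checked out.
# Switches each repo to master before collecting data and switches it back to dev afterwards.
# This way we collect information relevant to the most recent official UD release (provided the repos are up-to-date).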
# An optional leading --pull argument turns PULL on (1 = run 'git pull' in each
# repo after checking out a branch, 0 = do not). The flag is shifted away so
# that only repository directories remain in the positional parameters.
PULL=0
case "$1" in
  --pull)
    PULL=1
    shift
    ;;
esac
# Directory (relative to the current working directory) that receives one
# <repo name>.json file per processed repository.
OUTDIR="_corpus_metadata"
# Stop right away if the directory cannot be created: otherwise every repo
# would still have its branches switched while each JSON redirect fails.
if ! mkdir -p -- "$OUTDIR"; then
  echo "cannot create output directory $OUTDIR" >&2
  exit 1
fi
# Regenerates the corpus metadata JSON for a single UD repository.
#
# Checks out master (falling back to dev when master has no *.conllu files in
# the repository root, which is the case for upcoming treebanks that have not
# been released yet), runs corpus_stats.py from the caller's working directory
# while the selected branch is checked out, and finally leaves the repository
# on dev.
#
# Globals:
#   PULL   (read) - when 1, 'git pull' is run after each branch selection
#   OUTDIR (read) - directory that receives <repo name>.json
# Arguments:
#   $1 - path to the repository directory
# Outputs:
#   Progress messages on stdout, diagnostics on stderr; the stdout of
#   corpus_stats.py is written to $OUTDIR/<repo name>.json
# Returns:
#   1 if the repository directory cannot be entered, 0 otherwise
refresh_repo() {
  local repo_dir=$1
  local repo_branch=master
  local repo_name
  repo_name=$(basename -- "$repo_dir")

  echo ==================================================
  printf '%s\n' "$repo_dir"

  # Never run git in the caller's own working tree: if the directory cannot be
  # entered, skip this repository instead of switching branches in place.
  if ! pushd "$repo_dir"; then
    echo "cannot enter $repo_dir, skipping" >&2
    return 1
  fi

  # In general, the UD front page should show information based on the master branch.
  # However, for upcoming treebanks (not released before), the dev branch should be used instead.
  git checkout master
  if [[ "$PULL" == 1 ]]; then
    git pull
  fi
  # compgen -G succeeds only if the glob matches at least one path, so there
  # is no need to run (and parse the result of) ls.
  if compgen -G '*.conllu' > /dev/null; then
    echo conllu files found
  else
    echo conllu files not found, switching back to dev
    repo_branch=dev
    git checkout dev
    if [[ "$PULL" == 1 ]]; then
      git pull
    fi
  fi
  popd

  printf '%s\n' "$repo_name"
  # The data file globs are deliberately left to the shell's default handling:
  # a pattern that matches nothing is passed on literally, as before.
  python3 corpus_stats.py \
    --readme-dir "$repo_dir" \
    --repo-name "$repo_name" \
    --repo-branch "$repo_branch" \
    --codes-flags ./codes_and_flags.yaml \
    --releases ./valdan/releases.json \
    --json "$repo_dir"/*-ud-{train,dev,test}*.conllu \
    > "$OUTDIR/$repo_name.json"

  # Leave the repository on dev regardless of which branch was analysed.
  if pushd "$repo_dir"; then
    git checkout dev
    popd
  else
    echo "cannot re-enter $repo_dir to switch back to dev" >&2
  fi
  echo "done"
}

# Process every repository directory given on the command line. "$@" keeps
# each argument intact even if it contains spaces or glob characters.
for repo_dir in "$@"; do
  refresh_repo "$repo_dir"
done