-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_speech_main.sh
executable file
·64 lines (55 loc) · 1.8 KB
/
extract_speech_main.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env bash
# Print the command-line help on stdout: the invocation synopsis followed by
# a short description of the -x, -c and -p flags. Does not exit by itself.
usage() {
  # One printf argument per output line; the empty arguments are the blank
  # lines that frame the option list.
  printf '%s\n' \
    "Usage: $0 [-x|-c|-p]" \
    "" \
    " -x Extract speech turns (without stage directions), basic character" \
    " metadata from TEI header and more social metadata from personography" \
    " (one dataframe per play)." \
    " -c Collect the per-play dataframes into a single dataframe for the corpus" \
    " This also triggers a postprocessing (recoding etc.) in the whole-corpus df." \
    " -p Postprocess the whole-corpus df (some recoding and corrections)" \
    ""
}
# Parse the command-line flags into the three stage switches tested further
# down (EXTRACT, COLLECT, POSTPROCESS). Each switch is 1 when its flag was
# given and 0 otherwise.
# Start from known values so that same-named variables inherited from the
# caller's environment cannot switch a stage on by accident.
EXTRACT=0
COLLECT=0
POSTPROCESS=0
while getopts "xcp" opt; do
  case "$opt" in
    x) EXTRACT=1 ;;
    c) COLLECT=1 ;;
    p) POSTPROCESS=1 ;;
    *)
      # getopts has already reported the offending option on stderr. Stop
      # here instead of running stages next to a mistyped flag.
      usage >&2
      exit 2
      ;;
  esac
done
# No option at all: show the help (https://unix.stackexchange.com/a/50648).
# OPTIND is still 1 when getopts consumed nothing.
if [[ $OPTIND -eq 1 ]]; then
  usage
fi
# Per-play extraction: when EXTRACT is 1, announce the stage and run the
# pre-treatment command list script.
case "$EXTRACT" in
  1)
    printf '%s\n' "Extract per play"
    ./commands/command_list_pre_treatment.sh
    ;;
esac
#######################################
# Concatenate the per-play dataframes found in a directory into one TSV file.
# Arguments: $1 - directory holding the per-play dataframes
#            $2 - path of the combined file to (over)write
# Outputs:   the column header on stdout; warnings and errors on stderr
# Returns:   0 on success, 1 when the directory holds no regular files
#######################################
collect_dataframes() {
  local src_dir=$1 dest=$2
  local -a tables=()
  local table header first_line

  # Glob instead of parsing `ls`, so names with whitespace stay intact.
  for table in "$src_dir"/*; do
    # Skip sub-directories, an unmatched glob and the output file itself.
    if [[ -f "$table" && ! "$table" -ef "$dest" ]]; then
      tables+=("$table")
    fi
  done
  if (( ${#tables[@]} == 0 )); then
    printf 'No per-play dataframes found in "%s"\n' "$src_dir" >&2
    return 1
  fi

  # The header is written once, copied verbatim from the first dataframe so
  # that tabs, empty column names and names containing spaces survive.
  header=$(head -n 1 -- "${tables[0]}")
  printf 'Output columns\t%s\n' "$header"
  printf '%s\n' "$header" > "$dest"

  for table in "${tables[@]}"; do
    first_line=$(head -n 1 -- "$table")
    if [[ "$first_line" != "$header" ]]; then
      printf 'Warning: header of "%s" differs from "%s"\n' \
        "$table" "${tables[0]}" >&2
    fi
    # Rows from line 2 onwards, minus empty lines. awk also terminates a
    # final line that lacks a newline, so consecutive files never run
    # together on one row.
    awk 'NR > 1 && length($0) > 0' < "$table"
  done >> "$dest"
}

# collect per-play dataframes into a single df for the entire corpus
if [[ $COLLECT == 1 ]]; then
  # avoid forgetting postprocess
  POSTPROCESS=1
  echo "Collect to single dataframe"
  # The output directory is the first double-quoted string on the first line
  # of the extraction script that mentions dir_path.
  outdir=$(grep -- 'dir_path' pre_treatment/script/extract_character_speech.py \
    | head -n 1 | grep -Po '".+?"' | head -n 1 | tr -d '"')
  # An empty value would make the glob below expand to /*, so refuse it.
  if [[ -z "$outdir" || ! -d "$outdir" ]]; then
    printf 'Cannot use per-play output directory "%s"\n' "$outdir" >&2
    exit 1
  fi
  printf '%s\n' "$outdir"
  outdf_all=overall-per-character-speech.tsv
  # Do not fall through to postprocessing when nothing could be collected.
  collect_dataframes "$outdir" "$outdf_all" || exit 1
fi
# Postprocessing: when POSTPROCESS is 1, announce the stage and run the
# Python postprocessing script on the dataframe.
case "$POSTPROCESS" in
  1)
    printf '%s\n' "Postprocess dataframe"
    python ./pre_treatment/script/postprocess_character_speech_df.py
    ;;
esac