-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathWORKFLOW.sh
62 lines (47 loc) · 3.3 KB
/
WORKFLOW.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Steps to undertake to create and run the Factuality Classifier
# This is a full report of all the steps that were carried out. To only classify
# a new document, skip ahead to the section marked
# "This is where the classification begins".
#
# Author: Marieke van Erp
# Date: 18 September 2013
# Generate factuality.csv from the FactBank 1.0 database.
#
# The query below is MySQL (SELECT ... INTO OUTFILE), not shell, so it is fed
# to the mysql client through a quoted here-document; the quoting keeps the
# '\t', '\\' and '\n' escapes exactly as written. Left bare in this script,
# bash would stop with a syntax error at the first "(".
#
# FACTBANK_DB must name the database that holds the FactBank 1.0 tables; the
# script aborts with a message when it is unset or empty.
# NOTE(review): no connection options are passed here - presumably they come
# from the mysql client configuration (e.g. ~/.my.cnf); confirm for your setup.
#
# INTO OUTFILE is written by the MySQL server on the server host, and the
# statement fails if the target file already exists, so remove any stale
# /tmp/factuality.csv before re-running.
mysql "${FACTBANK_DB:?set FACTBANK_DB to the name of the FactBank 1.0 database}" <<'SQL'
SELECT DISTINCT
  sentences.file, sentences.sentId, tml_event.eId, tml_event.eClass,
  fb_factValue.eText, fb_factValue.eId, tml_instance.tense,
  tml_instance.aspect, tml_instance.pos, tml_instance.polarity,
  fb_factValue.relSourceText, sentences.sent, fb_factValue.factValue
INTO OUTFILE '/tmp/factuality.csv'
FIELDS TERMINATED BY '\t' OPTIONALLY ENCLOSED BY '"' ESCAPED BY '\\' LINES TERMINATED BY '\n'
FROM sentences
JOIN tml_event ON (sentences.file = tml_event.file AND sentences.sentId = tml_event.sentId)
JOIN fb_relSource ON (sentences.file = fb_relSource.file AND sentences.sentId = fb_relSource.sentId)
JOIN fb_factValue ON (tml_event.file = fb_factValue.file AND tml_event.sentId = fb_factValue.sentId AND tml_event.eId = fb_factValue.eId)
JOIN tml_instance ON (tml_event.file = tml_instance.file AND tml_event.eId = tml_instance.eId);
SQL
# Copy the export to your working directory (assumes the MySQL server runs on
# this machine, so that /tmp/factuality.csv is visible locally).
cp '/tmp/factuality.csv' .
# --- Training: feature vectors -> Mallet instances -> MaxEnt classifier ---
# File names used by the training steps below.
all_vectors=NWRFactualityFeatureVectors.csv
level1_vectors=NWRFactualityFeatureVectorsOnlyRelSourceLevel1.csv
mallet_input=FactBank.tab
mallet_vectors=FactBank.vectors
classifier=MyMaxEntFactuality.classifier
# This assumes that Mallet is located in the working directory.
mallet=mallet-2.0.7/bin/mallet

# Build the feature vectors.
# Note: factuality.csv is not included due to the FactBank data license.
perl NWRFactualityCreateFeatureVectors.pl > "$all_vectors"

# Results are better without the nested sources, so keep only the rows that
# carry an unnested (RELSOURCELEVEL1) factuality value.
grep "RELSOURCELEVEL1" < "$all_vectors" > "$level1_vectors"

# Mallet does not read csv, so convert to its ID LABEL DATA format.
# For version 0.01 only a window around the event is used.
perl convertCSVtoMallet.pl "$level1_vectors" > "$mallet_input"

# Turn the ID LABEL DATA file into a Mallet input (vectors) file.
"$mallet" import-file \
  --input "$mallet_input" \
  --output "$mallet_vectors"

# Train a MaxEnt classifier on the FactBank data.
"$mallet" train-classifier \
  --trainer MaxEnt \
  --input "$mallet_vectors" \
  --output-classifier "$classifier"
################
#
# This is where the classification begins
#
################
# Stem of the document to classify: the steps below read ${doc}.naf and write
# ${doc}.tab, ${doc}.output, ${doc}.sorted and ${doc}.factuality.naf.
doc=FILE

# Record a begin timestamp so the time needed to process a file can be logged.
perl beginTimestamp.pl

# Convert the NAF input file to the Mallet format (ID LABEL DATA).
perl NAFToMalletInputFactuality.pl "${doc}.naf" > "${doc}.tab"
# KAF will soon be deprecated, but if you still have a KAF file, use this instead:
#perl KAFToMalletInputFactuality.pl "${doc}.kaf" > "${doc}.tab"

# Classify the instances from the converted input file.
mallet-2.0.7/bin/csv2classify \
  --input "${doc}.tab" \
  --output "${doc}.output" \
  --classifier MyMaxEntFactuality.classifier

# Sort the Mallet output and select the highest score;
# the result lists the id, prediction and confidence.
perl sortMalletOutput.pl "${doc}.output" > "${doc}.sorted"

# Read the sorted predictions together with the original NAF file and insert a
# factuality layer with factuality score, confidence and word (or term?) ID.
perl convertMalletToNAF.pl "${doc}.naf" "${doc}.sorted" > "${doc}.factuality.naf"
# KAF will soon be deprecated, but if you still have a KAF file, use this instead:
#perl convertMalletToKAF.pl "${doc}.kaf" "${doc}.sorted" > "${doc}.factuality.kaf"

# Clean up the begin timestamp.
# NOTE(review): this name is all-lowercase while the script is called
# beginTimestamp.pl - confirm it matches the file that script actually writes.
rm begintimestamp.txt