Commit 3abd358

Merge, many pipeline tweaks
dmrd committed May 11, 2014
2 parents d248906 + 2f65bc5
Showing 10 changed files with 181 additions and 28 deletions.
12 changes: 7 additions & 5 deletions models/baseline.py
@@ -6,8 +6,8 @@
 
 import sys
 import numpy as np
-from sklearn.cross_validation import LeaveOneOut, KFold
-from sklearn.linear_model import LogisticRegression
+from sklearn.cross_validation import LeaveOneOut, KFold, StratifiedKFold
+from sklearn.linear_model import LogisticRegression, LassoCV
 from sklearn import metrics
 
 prefix = sys.argv[1]
@@ -24,6 +24,7 @@
 
 # Files that need loading
 data = [prefix + "_" + str(x) for x in range(n_types)]
+# data = [prefix + "_" + str(n_types)]
 
 # Read in files
 lines = []
@@ -62,13 +63,14 @@
 # Do actual classification
 predictions = Y.copy()
 clf = LogisticRegression()
+# clf = LassoCV()
 # for train, test in LeaveOneOut(len(Y)):
-for i, (train, test) in enumerate(KFold(len(Y), n_folds=nfolds)):
+for i, (train, test) in enumerate(StratifiedKFold(Y, n_folds=nfolds)):
     print("Fold {}".format(i + 1))
     clf.fit(X[train], Y[train])
     predictions[test] = clf.predict(X[test])
 
 print("Accuracy: {}".format(metrics.accuracy_score(Y, predictions)))
 
-# print(metrics.classification_report(Y, predictions))
-# print(metrics.confusion_matrix(Y, predictions))
+print(metrics.classification_report(Y, predictions))
+print(metrics.confusion_matrix(Y, predictions))
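The switch from KFold to StratifiedKFold matters when labels are imbalanced or ordered in the input file. A minimal sketch of the difference, using synthetic labels and the same pre-0.18 sklearn.cross_validation API that baseline.py targets:

# Sketch only: why the diff swaps KFold for StratifiedKFold.
import numpy as np
from sklearn.cross_validation import KFold, StratifiedKFold

Y = np.array([0] * 9 + [1] * 3)  # imbalanced, ordered author labels

# Unshuffled KFold can produce test folds containing a single class,
# which makes per-fold predictions degenerate.
for train, test in KFold(len(Y), n_folds=3):
    print("KFold test labels:     ", Y[test])

# StratifiedKFold preserves the 3:1 label ratio in every fold.
for train, test in StratifiedKFold(Y, n_folds=3):
    print("Stratified test labels:", Y[test])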
12 changes: 8 additions & 4 deletions models/slda/test/generate_data.py
@@ -16,7 +16,7 @@
 from collections import defaultdict
 
 def generate_documents(n_authors, n_topics, n_docs, n_words,
-                       n_words_per_doc):
+                       n_words_per_doc, divider=1.0):
     """
     Implements generative process for LDA
@@ -28,9 +28,9 @@ def generate_documents(n_authors, n_topics, n_docs, n_words,
     # Generate author dirichlet distributions over topics
     author_p = []
     for _ in range(n_authors):
-        x = np.random.rand(n_topics)
+        x = 0.01 + np.random.rand(n_topics) / divider
         author_p.append(x)
-        print(x / x.sum())
+        print(x)
 
     # Generate topic multinomial distributions over words
     # (drawn from dirichlet)
@@ -104,14 +104,18 @@ def save_docs(docs, prefix, topic_type):
     parser.add_argument('--n_words_per_doc',
                         help='Mean number of words per doc',
                         default=1000, type=int)
+    parser.add_argument('--divisor',
+                        help='Divisor for dirichlet parameters',
+                        default=1.0, type=float)
     args = parser.parse_args()
 
     # Write a file for each ngram type
     type_docs = None
     for i in range(args.n_types):
         type_docs = generate_documents(args.n_authors, args.n_topics,
                                        args.n_docs, args.n_words,
-                                       args.n_words_per_doc)
+                                       args.n_words_per_doc,
+                                       divider=args.divisor)
         save_docs(type_docs, args.prefix, i)
 
     # Go through once and write the label file
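The new divider (exposed as --divisor) shrinks the per-author Dirichlet parameters toward the 0.01 floor. A hedged sketch of its effect, with np.random.dirichlet standing in for the rest of the generative process:

# Sketch only: smaller Dirichlet parameters yield sparser author topic profiles.
import numpy as np

n_topics = 4
for divider in (1.0, 10.0, 100.0):
    alpha = 0.01 + np.random.rand(n_topics) / divider  # as in generate_documents
    theta = np.random.dirichlet(alpha)  # one draw of topic proportions
    print("divider={:6.1f}  alpha={}  theta={}".format(
        divider, alpha.round(3), theta.round(3)))
# As divider grows, alpha approaches 0.01 everywhere and draws of theta
# concentrate nearly all mass on a single topic, making authors more distinct.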
5 changes: 3 additions & 2 deletions models/slda/test/sanity_check.sh
@@ -5,5 +5,6 @@ make -C .. clean
 make -C ..
 mkdir sanity_check sc
 python generate_data.py --prefix sanity_check/sc --n_topics 4 --n_authors 4
-../slda est 3 ./sanity_check/sc ./sanity_check/sc_labels ../settings.txt 0.1 random ./sc 4 4 4
-../slda inf 3 ./sanity_check/sc ./sanity_check/sc_labels ../settings.txt ./sc/final.model ./sc
+python ../../../pipeline/test_train_split.py ./sanity_check/sc 3 2
+../slda est 3 ./sanity_check/fold0_train_sc ./sanity_check/fold0_train_sc_labels ../settings.txt 0.1 random ./sc 4 4 4
+../slda inf 3 ./sanity_check/fold0_test_sc ./sanity_check/fold0_test_sc_labels ../settings.txt ./sc/final.model ./sc
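The sanity check now splits before training, so est sees fold0_train_* files and inf sees fold0_test_* files. A hedged sketch of the fold naming this implies; the pattern is inferred from the arguments above, not read out of test_train_split.py itself:

# Sketch only: fold file names the updated sanity check appears to expect.
import os

prefix, n_folds = "./sanity_check/sc", 2
directory, base = os.path.split(prefix)
for fold in range(n_folds):
    for split in ("train", "test"):
        stem = os.path.join(directory, "fold{}_{}_{}".format(fold, split, base))
        print(stem + "_labels")  # plus per-type files stem_0 ... stem_{n_types-1}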
8 changes: 4 additions & 4 deletions pipeline/copy_ngrams.sh
@@ -4,10 +4,10 @@
 
 # Copy over files
 cp $1_etymology_4_author.txt $2_labels
-cp $1_etymology_4_model.txt $2_0
-cp $1_meter_4_model.txt $2_1
-cp $1_pos_4_model.txt $2_2
-cp $1_syllable_4_model.txt $2_3
+cp $1_pos_4_model.txt $2_0
+cp $1_syllable_4_model.txt $2_1
+cp $1_etymology_4_model.txt $2_2
+cp $1_meter_4_model.txt $2_3
 cp $1_syllable_count_4_model.txt $2_4
 cp $1_word_count_4_model.txt $2_5
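Downstream scripts load these files by numeric suffix, so the reordering silently changes which feature each index denotes. The new index-to-feature mapping, read directly off the cp lines, written out for reference:

# Index -> ngram feature type after this commit (previous assignment in comments).
NGRAM_TYPES = {
    0: "pos",             # was etymology
    1: "syllable",        # was meter
    2: "etymology",       # was pos
    3: "meter",           # was syllable
    4: "syllable_count",  # unchanged
    5: "word_count",      # unchanged
}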
2 changes: 1 addition & 1 deletion pipeline/run_model.py
@@ -22,7 +22,7 @@
 
 NTYPES = 6
 
-os.system("make -C ../models/slda clean && make -C ../models/slda")
+#os.system("make -C ../models/slda clean && make -C ../models/slda")
 os.system("mkdir output/{n} output/{n}/models output/{n}/data".format(n=data_name))
 os.system("rm -f output/{n}/models/* output/{n}/data/*".format(n=data_name))
 os.system("sh copy_ngrams.sh ../slda_input_files/{n} output/{n}/data/{n} {min_doc}".format(n=data_name,
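Commenting out the make line skips rebuilding slda on every pipeline run. A hypothetical opt-in flag (not part of run_model.py; sketch only) would keep the rebuild one switch away instead of a hand edit:

# Sketch only: a hypothetical --rebuild flag in place of the commented-out line.
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--rebuild", action="store_true",
                    help="rebuild the slda binary before running the pipeline")
args, _ = parser.parse_known_args()

if args.rebuild:
    os.system("make -C ../models/slda clean && make -C ../models/slda")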
2 changes: 1 addition & 1 deletion pipeline/test_train_split.py
@@ -18,7 +18,7 @@
 print(sys.argv)
 
 prefix = sys.argv[1]
-n_types = int(sys.argv[2])  # 0.5
+n_types = int(sys.argv[2])  # Number of ngram types (usually 6)
 folds = int(sys.argv[3])  # Number of folds to use
 
 # Files that need loading
30 changes: 30 additions & 0 deletions poster/poster.bib
@@ -7,6 +7,36 @@ @inproceedings{Blei2007
   year={2007}
 }
 
+@article{hoffman2013stochastic,
+  title={Stochastic variational inference},
+  author={Hoffman, Matthew D and Blei, David M and Wang, Chong and Paisley, John},
+  journal={The Journal of Machine Learning Research},
+  volume={14},
+  number={1},
+  pages={1303--1347},
+  year={2013},
+  publisher={JMLR.org}
+}
+
+@inproceedings{wang2009simultaneous,
+  title={Simultaneous image classification and annotation},
+  author={Wang, Chong and Blei, David and Li, Fei-Fei},
+  booktitle={Computer Vision and Pattern Recognition, 2009. CVPR 2009. IEEE Conference on},
+  pages={1903--1910},
+  year={2009},
+  organization={IEEE}
+}
+
+@inproceedings{rosen2004author,
+  title={The author-topic model for authors and documents},
+  author={Rosen-Zvi, Michal and Griffiths, Thomas and Steyvers, Mark and Smyth, Padhraic},
+  booktitle={Proceedings of the 20th conference on Uncertainty in artificial intelligence},
+  pages={487--494},
+  year={2004},
+  organization={AUAI Press}
+}
 
 @article{Blei2003,
   title = {Latent {D}irichlet {A}llocation},
   author={Blei, David M and Ng, Andrew Y and Jordan, Michael I},
15 changes: 7 additions & 8 deletions poster/poster.tex
@@ -125,11 +125,9 @@
 \begin{itemize}
 \item A set of documents labeled (by author) on which to train.
 \item A set of anonymized documents to classify.
-\end{itemize}Methods for authorship detection have traditionally depended on lexical analysis of the text, making them relatively context-dependent.
+\end{itemize} Methods for authorship detection have traditionally depended on careful feature extraction and rather black-box methods. Hence, they rely on extensive domain-specific knowledge and can be difficult to decipher. Here, we present the \textit{MAD Topic Model}, which uses syntactic and stylometric n-gram features (e.g., part-of-speech tags, meter). MAD fits separate topic models to each of these ngram vocabularies and then combines the models with a multiclass logistic regression classifier. After fitting the topic model parameters, new data can be classified using the multiclass component.
 
-Instead, the \textit{MAD Topic Model} depends solely on syntactic and stylometric features (e.g., part-of-speech tags, meter), which are less context-dependent. MAD treats these features as vocabularies over which topic models can be determined, performing a Supervised Latent Dirichlet Allocation (SLDA) algorithm over $n$-gram stylistic features to determine authorship of anonymized text.
-
-Preliminary results show significant improvement over more naive techniques (such as Logistic MLE) using the same features. As a by-product, MAD's topic models over the $n$-gram stylistic features can be used to extract compact representations of stylistic tendency and discern which features are most indicative of writing style.
+INSERT GRAPHIC
 
 \end{block}
@@ -239,8 +237,7 @@
 \end{tikzpicture}
 \caption{Graphical Model for the MAD Topic Model}
 \end{figure}
-
-
+\small The MAD topic model combines the SLDA algorithm presented in \cite{wang2009simultaneous} with the Author Topic Model in \cite{rosen2004author}, extending both to account for multiple word types. The model is fit with variational inference, following the coordinate ascent updates in \cite{wang2009simultaneous}. Stochastic variational inference was also tested, but proved impractical for these rather small data sets.
 
 \end{block}
@@ -256,7 +253,9 @@
 
 \begin{block}{Results}
 
-Our results.
+\small
+\indent Unfortunately, preliminary results show that MAD fares far worse than using the same features with another classification scheme. This is consistent with \cite{...}, which suggests that a Pitman-Yor process better captures power-law frequencies in language use than Dirichlet methods. Nevertheless, MAD's topic models over the $n$-gram stylistic features can be used to extract compact representations of stylistic tendency and discern which features are most indicative of individual writing style.
 
 \end{block}
@@ -299,7 +298,7 @@
 \setbeamercolor{block title}{fg=BreakfastRed,bg=white} % Change the block title color
 
 \begin{block}{References}
-
+\small
 \bibliography{poster}
 \bibliographystyle{plainnat}
 
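A hedged LaTeX sketch of the per-author generative step the poster text describes; the symbols ($\alpha_a$ for author $a$'s Dirichlet prior, $\beta^{(t)}_k$ for topic $k$'s word distribution over ngram type $t$) are assumed, not copied from the poster source:

% Sketch only: generative process implied by the description above.
\begin{align*}
\theta_d &\sim \mathrm{Dirichlet}(\alpha_{a(d)}) && \text{topic proportions, per-author prior} \\
z_{dn}^{(t)} &\sim \mathrm{Multinomial}(\theta_d) && \text{topic for token $n$ of type $t$} \\
w_{dn}^{(t)} &\sim \mathrm{Multinomial}\bigl(\beta^{(t)}_{z_{dn}^{(t)}}\bigr) && \text{observed ngram token}
\end{align*}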
115 changes: 115 additions & 0 deletions report/writeup.bib
@@ -6,6 +6,14 @@ @inproceedings{Blei2007
   pages={121--128},
   year={2007}
 }
+@inproceedings{rosen2004author,
+  title={The author-topic model for authors and documents},
+  author={Rosen-Zvi, Michal and Griffiths, Thomas and Steyvers, Mark and Smyth, Padhraic},
+  booktitle={Proceedings of the 20th conference on Uncertainty in artificial intelligence},
+  pages={487--494},
+  year={2004},
+  organization={AUAI Press}
+}
 
 @article{Blei2003,
   title = {Latent {D}irichlet {A}llocation},
@@ -16,6 +24,37 @@ @article{Blei2003
   volume={3},
 }
 
+@inproceedings{wang2009simultaneous,
+  title={Simultaneous image classification and annotation},
+  author={Wang, Chong and Blei, David and Li, Fei-Fei},
+  booktitle={Computer Vision and Pattern Recognition, 2009. CVPR 2009. IEEE Conference on},
+  pages={1903--1910},
+  year={2009},
+  organization={IEEE}
+}
+
+@article{hoffman2013stochastic,
+  title={Stochastic variational inference},
+  author={Hoffman, Matthew D and Blei, David M and Wang, Chong and Paisley, John},
+  journal={The Journal of Machine Learning Research},
+  volume={14},
+  number={1},
+  pages={1303--1347},
+  year={2013},
+  publisher={JMLR.org}
+}
+
+@article{wainwright2008graphical,
+  title={Graphical models, exponential families, and variational inference},
+  author={Wainwright, Martin J and Jordan, Michael I},
+  journal={Foundations and Trends{\textregistered} in Machine Learning},
+  volume={1},
+  number={1-2},
+  pages={1--305},
+  year={2008},
+  publisher={Now Publishers Inc.}
+}
 
 @incollection{Porter,
   author = {Porter, M. F.},
   title = {An Algorithm for Suffix Stripping},
@@ -93,3 +132,79 @@ @article{Stein
   address = {New York, NY, USA},
 }
+@article{brennan2012adversarial,
+  title={Adversarial stylometry: Circumventing authorship recognition to preserve privacy and anonymity},
+  author={Brennan, Michael and Afroz, Sadia and Greenstadt, Rachel},
+  journal={ACM Transactions on Information and System Security (TISSEC)},
+  volume={15},
+  number={3},
+  pages={12},
+  year={2012},
+  publisher={ACM}
+}
+@article{hughes2012empirical,
+  title={Empirical mode decomposition analysis for visual stylometry},
+  author={Hughes, James M and Mao, Dong and Rockmore, Daniel N and Wang, Yang and Wu, Qiang},
+  journal={Pattern Analysis and Machine Intelligence, IEEE Transactions on},
+  volume={34},
+  number={11},
+  pages={2147--2157},
+  year={2012},
+  publisher={IEEE}
+}
+
+@inproceedings{el2011computational,
+  title={A computational linguistic approach for the identification of translator stylometry using Arabic-English text},
+  author={El-Fiqi, Heba and Petraki, Eleni and Abbass, Hussein A},
+  booktitle={Fuzzy Systems (FUZZ), 2011 IEEE International Conference on},
+  pages={2039--2045},
+  year={2011},
+  organization={IEEE}
+}
+@article{stamatatos2009intrinsic,
+  title={Intrinsic plagiarism detection using character n-gram profiles},
+  author={Stamatatos, Efstathios},
+  journal={threshold},
+  volume={2},
+  pages={1--500},
+  year={2009}
+}
+@article{koppel2011authorship,
+  title={Authorship attribution in the wild},
+  author={Koppel, Moshe and Schler, Jonathan and Argamon, Shlomo},
+  journal={Language Resources and Evaluation},
+  volume={45},
+  number={1},
+  pages={83--94},
+  year={2011},
+  publisher={Springer}
+}
+@inproceedings{raghavan2010authorship,
+  title={Authorship attribution using probabilistic context-free grammars},
+  author={Raghavan, Sindhu and Kovashka, Adriana and Mooney, Raymond},
+  booktitle={Proceedings of the ACL 2010 Conference Short Papers},
+  pages={38--42},
+  year={2010},
+  organization={Association for Computational Linguistics}
+}
+@article{koppel2009computational,
+  title={Computational methods in authorship attribution},
+  author={Koppel, Moshe and Schler, Jonathan and Argamon, Shlomo},
+  journal={Journal of the American Society for Information Science and Technology},
+  volume={60},
+  number={1},
+  pages={9--26},
+  year={2009},
+  publisher={Wiley Online Library}
+}
+@article{stamatatos2009survey,
+  title={A survey of modern authorship attribution methods},
+  author={Stamatatos, Efstathios},
+  journal={Journal of the American Society for Information Science and Technology},
+  volume={60},
+  number={3},
+  pages={538--556},
+  year={2009},
+  publisher={Wiley Online Library}
+}
8 changes: 5 additions & 3 deletions report/writeup.tex
@@ -89,11 +89,13 @@ \section{Feature Extraction}
 
 \section{Methods}
 
-
-
+To explore our data we feed our extracted features as bags of n-grams to a novel LDA extension, the Multivalence Authorship Detection (MAD) Topic Model. The MAD topic model combines the SLDA algorithm presented in \cite{wang2009simultaneous} and \cite{Blei2007} with the Author Topic Model in \cite{rosen2004author}, extending both to account for multiple word types. For each word type $t$, MAD posits its own LDA topic model. Unlike conventional LDA, in which each document shares a common Dirichlet prior, MAD gives each author her own Dirichlet prior, which can be optimized with coordinate descent. This differs from the Author Topic Model, which treats each author's oeuvre as one contiguous document.
+Like SLDA, MAD has a multiclass regression parameter $\eta$, from which classes are drawn from $\text{softmax}(\eta^T\bar{z})$, where $\bar{z}$ are the average topic assignments for each work. The complete generative process is specified in the Appendix, and the graphical model is shown in... The key innovation in this model is that it is doubly supervised: first, each author has her own topic proportions, which enforce shared topics between her documents; second, upon conditioning on the multiclass logistic regression $\eta$, the topic assignments $z$ that contribute to correct classification are given a higher likelihood. Thus, one would expect that the \emph{more salient} features are selected for during inference. It is crucial to note that, during training, authorship is thereby treated as both a known label and a random variable. In the test stage, however, we marginalize over authors, as described in the Appendix.
+The model is fit with variational inference, discussed in \cite{wainwright2008graphical}. It is well known that exact inference for LDA requires computing a prohibitive integral \cite{Blei2003}. Instead, the posterior distribution $p(a,w,z,\theta \mid \alpha,\lambda,\eta)$ is approximated with a variational family $q(\theta|\gamma)\prod_{n}q(z|\phi)$, indexed by parameters $\gamma$ and $\phi$. Here $\theta|\gamma \sim \text{Dirichlet}(\gamma)$ and $z|\phi \sim \text{Multi}(\phi)$, so that complete conditionals of $\theta$ and $z$ under $p$ are in the same family as their variational counterparts. Up to a constant independent of the variational parameters $\phi$ and $\gamma$, the negative KL divergence between $q$ and $p$ gives a lower bound on the marginal log likelihood. This is known as the ELBO and, though nonconvex, can be optimized with coordinate-wise gradient ascent. The parameters of the model, notably the per-author topics, can be fit using maximum likelihood methods. There are a few subtleties required to accommodate the multiclass supervision, which we defer to the appendix. The updates follow \cite{wang2009simultaneous} very closely and, in the interest of brevity, are omitted.
+The model was implemented in C++ and based upon Chong Wang's code accompanying \cite{wang2009simultaneous}. In addition to the MAD Topic Model, our code supports $L1$ penalization and stochastic variational inference. Neither extension proved particularly effective, for reasons discussed in the appendix.
 \section{Evaluation}
 
 
 \section{Conclusions}
 
 \newpage
 \bibliography{writeup}
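A hedged restatement of the bound the Methods paragraph invokes, in the writeup's notation (the factorized $q(\theta|\gamma)\prod_n q(z|\phi)$ is assumed to be the $q$ below):

% Sketch only: the evidence lower bound (ELBO) referenced in Methods.
\begin{equation*}
\log p(w \mid \alpha, \lambda, \eta)
  \;=\; \underbrace{\mathbb{E}_q\!\left[\log p(w, z, \theta \mid \alpha, \lambda, \eta)\right]
  - \mathbb{E}_q\!\left[\log q(z, \theta)\right]}_{\text{ELBO } \mathcal{L}(\gamma, \phi)}
  \;+\; \mathrm{KL}\!\left(q(z, \theta) \,\middle\|\, p(z, \theta \mid w)\right),
\end{equation*}
% so maximizing the ELBO over gamma and phi minimizes the KL gap to the posterior.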
