Commit 3abd358

Merge, many pipeline tweaks
dmrd committed May 11, 2014
2 parents d248906 + 2f65bc5
Showing 10 changed files with 181 additions and 28 deletions.
12 changes: 7 additions & 5 deletions models/baseline.py
@@ -6,8 +6,8 @@
 
 import sys
 import numpy as np
-from sklearn.cross_validation import LeaveOneOut, KFold
-from sklearn.linear_model import LogisticRegression
+from sklearn.cross_validation import LeaveOneOut, KFold, StratifiedKFold
+from sklearn.linear_model import LogisticRegression, LassoCV
 from sklearn import metrics
 
 prefix = sys.argv[1]
@@ -24,6 +24,7 @@
 
 # Files that need loading
 data = [prefix + "_" + str(x) for x in range(n_types)]
+# data = [prefix + "_" + str(n_types)]
 
 # Read in files
 lines = []
@@ -62,13 +63,14 @@
 # Do actual classification
 predictions = Y.copy()
 clf = LogisticRegression()
+# clf = LassoCV()
 # for train, test in LeaveOneOut(len(Y)):
-for i, (train, test) in enumerate(KFold(len(Y), n_folds=nfolds)):
+for i, (train, test) in enumerate(StratifiedKFold(Y, n_folds=nfolds)):
     print("Fold {}".format(i + 1))
     clf.fit(X[train], Y[train])
     predictions[test] = clf.predict(X[test])
 
 print("Accuracy: {}".format(metrics.accuracy_score(Y, predictions)))
 
-# print(metrics.classification_report(Y, predictions))
-# print(metrics.confusion_matrix(Y, predictions))
+print(metrics.classification_report(Y, predictions))
+print(metrics.confusion_matrix(Y, predictions))
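The switch from KFold to StratifiedKFold matters when labels are imbalanced or ordered in the input file. A minimal sketch of the difference, using synthetic labels and the same pre-0.18 sklearn.cross_validation API that baseline.py targets:

# Sketch only: why the diff swaps KFold for StratifiedKFold.
import numpy as np
from sklearn.cross_validation import KFold, StratifiedKFold

Y = np.array([0] * 9 + [1] * 3)  # imbalanced, ordered author labels

# Unshuffled KFold can produce test folds containing a single class,
# which makes per-fold predictions degenerate.
for train, test in KFold(len(Y), n_folds=3):
    print("KFold test labels:     ", Y[test])

# StratifiedKFold preserves the 3:1 label ratio in every fold.
for train, test in StratifiedKFold(Y, n_folds=3):
    print("Stratified test labels:", Y[test])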
12 changes: 8 additions & 4 deletions models/slda/test/generate_data.py
@@ -16,7 +16,7 @@
 from collections import defaultdict
 
 def generate_documents(n_authors, n_topics, n_docs, n_words,
-                       n_words_per_doc):
+                       n_words_per_doc, divider=1.0):
     """
     Implements generative process for LDA
@@ -28,9 +28,9 @@ def generate_documents(n_authors, n_topics, n_docs, n_words,
     # Generate author dirichlet distributions over topics
     author_p = []
     for _ in range(n_authors):
-        x = np.random.rand(n_topics)
+        x = 0.01 + np.random.rand(n_topics) / divider
         author_p.append(x)
-        print(x / x.sum())
+        print(x)
 
     # Generate topic multinomial distributions over words
     # (drawn from dirichlet)
@@ -104,14 +104,18 @@ def save_docs(docs, prefix, topic_type):
     parser.add_argument('--n_words_per_doc',
                         help='Mean number of words per doc',
                         default=1000, type=int)
+    parser.add_argument('--divisor',
+                        help='Divisor for dirichlet parameters',
+                        default=1.0, type=float)
     args = parser.parse_args()
 
     # Write a file for each ngram type
     type_docs = None
     for i in range(args.n_types):
         type_docs = generate_documents(args.n_authors, args.n_topics,
                                        args.n_docs, args.n_words,
-                                       args.n_words_per_doc)
+                                       args.n_words_per_doc,
+                                       divider=args.divisor)
         save_docs(type_docs, args.prefix, i)
 
     # Go through once and write the label file
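The new divider (exposed as --divisor) shrinks the per-author Dirichlet parameters toward the 0.01 floor. A hedged sketch of its effect, with np.random.dirichlet standing in for the rest of the generative process:

# Sketch only: smaller Dirichlet parameters yield sparser author topic profiles.
import numpy as np

n_topics = 4
for divider in (1.0, 10.0, 100.0):
    alpha = 0.01 + np.random.rand(n_topics) / divider  # as in generate_documents
    theta = np.random.dirichlet(alpha)  # one draw of topic proportions
    print("divider={:6.1f}  alpha={}  theta={}".format(
        divider, alpha.round(3), theta.round(3)))
# As divider grows, alpha approaches 0.01 everywhere and draws of theta
# concentrate nearly all mass on a single topic, making authors more distinct.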
5 changes: 3 additions & 2 deletions models/slda/test/sanity_check.sh
@@ -5,5 +5,6 @@ make -C .. clean
 make -C ..
 mkdir sanity_check sc
 python generate_data.py --prefix sanity_check/sc --n_topics 4 --n_authors 4
-../slda est 3 ./sanity_check/sc ./sanity_check/sc_labels ../settings.txt 0.1 random ./sc 4 4 4
-../slda inf 3 ./sanity_check/sc ./sanity_check/sc_labels ../settings.txt ./sc/final.model ./sc
+python ../../../pipeline/test_train_split.py ./sanity_check/sc 3 2
+../slda est 3 ./sanity_check/fold0_train_sc ./sanity_check/fold0_train_sc_labels ../settings.txt 0.1 random ./sc 4 4 4
+../slda inf 3 ./sanity_check/fold0_test_sc ./sanity_check/fold0_test_sc_labels ../settings.txt ./sc/final.model ./sc
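The sanity check now splits before training, so est sees fold0_train_* files and inf sees fold0_test_* files. A hedged sketch of the fold naming this implies; the pattern is inferred from the arguments above, not read out of test_train_split.py itself:

# Sketch only: fold file names the updated sanity check appears to expect.
import os

prefix, n_folds = "./sanity_check/sc", 2
directory, base = os.path.split(prefix)
for fold in range(n_folds):
    for split in ("train", "test"):
        stem = os.path.join(directory, "fold{}_{}_{}".format(fold, split, base))
        print(stem + "_labels")  # plus per-type files stem_0 ... stem_{n_types-1}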
8 changes: 4 additions & 4 deletions pipeline/copy_ngrams.sh
@@ -4,10 +4,10 @@
 
 # Copy over files
 cp $1_etymology_4_author.txt $2_labels
-cp $1_etymology_4_model.txt $2_0
-cp $1_meter_4_model.txt $2_1
-cp $1_pos_4_model.txt $2_2
-cp $1_syllable_4_model.txt $2_3
+cp $1_pos_4_model.txt $2_0
+cp $1_syllable_4_model.txt $2_1
+cp $1_etymology_4_model.txt $2_2
+cp $1_meter_4_model.txt $2_3
 cp $1_syllable_count_4_model.txt $2_4
 cp $1_word_count_4_model.txt $2_5
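Downstream scripts load these files by numeric suffix, so the reordering silently changes which feature each index denotes. The new index-to-feature mapping, read directly off the cp lines, written out for reference:

# Index -> ngram feature type after this commit (previous assignment in comments).
NGRAM_TYPES = {
    0: "pos",             # was etymology
    1: "syllable",        # was meter
    2: "etymology",       # was pos
    3: "meter",           # was syllable
    4: "syllable_count",  # unchanged
    5: "word_count",      # unchanged
}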
2 changes: 1 addition & 1 deletion pipeline/run_model.py
@@ -22,7 +22,7 @@
 
 NTYPES = 6
 
-os.system("make -C ../models/slda clean && make -C ../models/slda")
+#os.system("make -C ../models/slda clean && make -C ../models/slda")
 os.system("mkdir output/{n} output/{n}/models output/{n}/data".format(n=data_name))
 os.system("rm -f output/{n}/models/* output/{n}/data/*".format(n=data_name))
 os.system("sh copy_ngrams.sh ../slda_input_files/{n} output/{n}/data/{n} {min_doc}".format(n=data_name,
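Commenting out the make line skips rebuilding slda on every pipeline run. A hypothetical opt-in flag (not part of run_model.py; sketch only) would keep the rebuild one switch away instead of a hand edit:

# Sketch only: a hypothetical --rebuild flag in place of the commented-out line.
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--rebuild", action="store_true",
                    help="rebuild the slda binary before running the pipeline")
args, _ = parser.parse_known_args()

if args.rebuild:
    os.system("make -C ../models/slda clean && make -C ../models/slda")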
2 changes: 1 addition & 1 deletion pipeline/test_train_split.py
@@ -18,7 +18,7 @@
 print(sys.argv)
 
 prefix = sys.argv[1]
-n_types = int(sys.argv[2])  # 0.5
+n_types = int(sys.argv[2])  # Number of ngram types (usually 6)
 folds = int(sys.argv[3])  # Number of folds to use
 
 # Files that need loading
30 changes: 30 additions & 0 deletions poster/poster.bib
@@ -7,6 +7,36 @@ @inproceedings{Blei2007
   year={2007}
 }
 
+@article{hoffman2013stochastic,
+  title={Stochastic variational inference},
+  author={Hoffman, Matthew D and Blei, David M and Wang, Chong and Paisley, John},
+  journal={The Journal of Machine Learning Research},
+  volume={14},
+  number={1},
+  pages={1303--1347},
+  year={2013},
+  publisher={JMLR.org}
+}
+
+@inproceedings{wang2009simultaneous,
+  title={Simultaneous image classification and annotation},
+  author={Wang, Chong and Blei, David and Li, Fei-Fei},
+  booktitle={Computer Vision and Pattern Recognition, 2009. CVPR 2009. IEEE Conference on},
+  pages={1903--1910},
+  year={2009},
+  organization={IEEE}
+}
+
+@inproceedings{rosen2004author,
+  title={The author-topic model for authors and documents},
+  author={Rosen-Zvi, Michal and Griffiths, Thomas and Steyvers, Mark and Smyth, Padhraic},
+  booktitle={Proceedings of the 20th conference on Uncertainty in artificial intelligence},
+  pages={487--494},
+  year={2004},
+  organization={AUAI Press}
+}
 
 @article{Blei2003,
   title = {Latent {D}irichlet {A}llocation},
   author={Blei, David M and Ng, Andrew Y and Jordan, Michael I},
15 changes: 7 additions & 8 deletions poster/poster.tex
@@ -125,11 +125,9 @@
 \begin{itemize}
 \item A set of documents labeled (by author) on which to train.
 \item A set of anonymized documents to classify.
-\end{itemize}Methods for authorship detection have traditionally depended on lexical analysis of the text, making them relatively context-dependent.
+\end{itemize} Methods for authorship detection have traditionally depended on careful feature extraction and rather black-box methods. Hence, they rely on extensive domain-specific knowledge and can be difficult to decipher. Here, we present the \textit{MAD Topic Model}, which uses syntactic and stylometric n-gram features (e.g., part-of-speech tags, meter). MAD fits separate topic models to each of these ngram vocabularies and then combines the models with a multiclass logistic regression classifier. After fitting the topic model parameters, new data can be classified using the multiclass component.
 
-Instead, the \textit{MAD Topic Model} depends solely on syntactic and stylometric features (e.g., part-of-speech tags, meter), which are less context-dependent. MAD treats these features as vocabularies over which topic models can be determined, performing a Supervised Latent Dirichlet Allocation (SLDA) algorithm over $n$-gram stylistic features to determine authorship of anonymized text.
-
-Preliminary results show significant improvement over more naive techniques (such as Logistic MLE) using the same features. As a by-product, MAD's topic models over the $n$-gram stylistic features can be used to extract compact representations of stylistic tendency and discern which features are most indicative of writing style.
+INSERT GRAPHIC
 
 \end{block}
@@ -239,8 +237,7 @@
 \end{tikzpicture}
 \caption{Graphical Model for the MAD Topic Model}
 \end{figure}
-
-
+\small The MAD topic model combines the SLDA algorithm presented in \cite{wang2009simultaneous} with the Author Topic Model in \cite{rosen2004author}, extending both to account for multiple word types. The model is fit with variational inference, following the coordinate ascent updates in \cite{wang2009simultaneous}. Stochastic variational inference was also tested, but proved impractical for these rather small data sets.
 
 \end{block}
@@ -256,7 +253,9 @@
 
 \begin{block}{Results}
 
-Our results.
+\small
+\indent Unfortunately, preliminary results show that MAD fares far worse than using the same features with another classification scheme. This is consistent with \cite{...}, which suggests that a Pitman-Yor process better captures power-law frequencies in language use than Dirichlet methods. Nevertheless, MAD's topic models over the $n$-gram stylistic features can be used to extract compact representations of stylistic tendency and discern which features are most indicative of individual writing style.
 
 \end{block}
@@ -299,7 +298,7 @@
 \setbeamercolor{block title}{fg=BreakfastRed,bg=white} % Change the block title color
 
 \begin{block}{References}
-
+\small
 \bibliography{poster}
 \bibliographystyle{plainnat}
 
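A hedged LaTeX sketch of the per-author generative step the poster text describes; the symbols ($\alpha_a$ for author $a$'s Dirichlet prior, $\beta^{(t)}_k$ for topic $k$'s word distribution over ngram type $t$) are assumed, not copied from the poster source:

% Sketch only: generative process implied by the description above.
\begin{align*}
\theta_d &\sim \mathrm{Dirichlet}(\alpha_{a(d)}) && \text{topic proportions, per-author prior} \\
z_{dn}^{(t)} &\sim \mathrm{Multinomial}(\theta_d) && \text{topic for token $n$ of type $t$} \\
w_{dn}^{(t)} &\sim \mathrm{Multinomial}\bigl(\beta^{(t)}_{z_{dn}^{(t)}}\bigr) && \text{observed ngram token}
\end{align*}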
115 changes: 115 additions & 0 deletions report/writeup.bib
@@ -6,6 +6,14 @@ @inproceedings{Blei2007
   pages={121--128},
   year={2007}
 }
+@inproceedings{rosen2004author,
+  title={The author-topic model for authors and documents},
+  author={Rosen-Zvi, Michal and Griffiths, Thomas and Steyvers, Mark and Smyth, Padhraic},
+  booktitle={Proceedings of the 20th conference on Uncertainty in artificial intelligence},
+  pages={487--494},
+  year={2004},
+  organization={AUAI Press}
+}
 
 @article{Blei2003,
   title = {Latent {D}irichlet {A}llocation},
@@ -16,6 +24,37 @@ @article{Blei2003
   volume={3},
 }
 
+@inproceedings{wang2009simultaneous,
+  title={Simultaneous image classification and annotation},
+  author={Wang, Chong and Blei, David and Li, Fei-Fei},
+  booktitle={Computer Vision and Pattern Recognition, 2009. CVPR 2009. IEEE Conference on},
+  pages={1903--1910},
+  year={2009},
+  organization={IEEE}
+}
+
+@article{hoffman2013stochastic,
+  title={Stochastic variational inference},
+  author={Hoffman, Matthew D and Blei, David M and Wang, Chong and Paisley, John},
+  journal={The Journal of Machine Learning Research},
+  volume={14},
+  number={1},
+  pages={1303--1347},
+  year={2013},
+  publisher={JMLR.org}
+}
+
+@article{wainwright2008graphical,
+  title={Graphical models, exponential families, and variational inference},
+  author={Wainwright, Martin J and Jordan, Michael I},
+  journal={Foundations and Trends{\textregistered} in Machine Learning},
+  volume={1},
+  number={1-2},
+  pages={1--305},
+  year={2008},
+  publisher={Now Publishers Inc.}
+}
 
 @incollection{Porter,
   author = {Porter, M. F.},
   title = {An Algorithm for Suffix Stripping},
@@ -93,3 +132,79 @@ @article{Stein
   address = {New York, NY, USA},
 }
+@article{brennan2012adversarial,
+  title={Adversarial stylometry: Circumventing authorship recognition to preserve privacy and anonymity},
+  author={Brennan, Michael and Afroz, Sadia and Greenstadt, Rachel},
+  journal={ACM Transactions on Information and System Security (TISSEC)},
+  volume={15},
+  number={3},
+  pages={12},
+  year={2012},
+  publisher={ACM}
+}
+@article{hughes2012empirical,
+  title={Empirical mode decomposition analysis for visual stylometry},
+  author={Hughes, James M and Mao, Dong and Rockmore, Daniel N and Wang, Yang and Wu, Qiang},
+  journal={Pattern Analysis and Machine Intelligence, IEEE Transactions on},
+  volume={34},
+  number={11},
+  pages={2147--2157},
+  year={2012},
+  publisher={IEEE}
+}
+
+@inproceedings{el2011computational,
+  title={A computational linguistic approach for the identification of translator stylometry using Arabic-English text},
+  author={El-Fiqi, Heba and Petraki, Eleni and Abbass, Hussein A},
+  booktitle={Fuzzy Systems (FUZZ), 2011 IEEE International Conference on},
+  pages={2039--2045},
+  year={2011},
+  organization={IEEE}
+}
+@article{stamatatos2009intrinsic,
+  title={Intrinsic plagiarism detection using character n-gram profiles},
+  author={Stamatatos, Efstathios},
+  journal={threshold},
+  volume={2},
+  pages={1--500},
+  year={2009}
+}
+@article{koppel2011authorship,
+  title={Authorship attribution in the wild},
+  author={Koppel, Moshe and Schler, Jonathan and Argamon, Shlomo},
+  journal={Language Resources and Evaluation},
+  volume={45},
+  number={1},
+  pages={83--94},
+  year={2011},
+  publisher={Springer}
+}
+@inproceedings{raghavan2010authorship,
+  title={Authorship attribution using probabilistic context-free grammars},
+  author={Raghavan, Sindhu and Kovashka, Adriana and Mooney, Raymond},
+  booktitle={Proceedings of the ACL 2010 Conference Short Papers},
+  pages={38--42},
+  year={2010},
+  organization={Association for Computational Linguistics}
+}
+@article{koppel2009computational,
+  title={Computational methods in authorship attribution},
+  author={Koppel, Moshe and Schler, Jonathan and Argamon, Shlomo},
+  journal={Journal of the American Society for Information Science and Technology},
+  volume={60},
+  number={1},
+  pages={9--26},
+  year={2009},
+  publisher={Wiley Online Library}
+}
+@article{stamatatos2009survey,
+  title={A survey of modern authorship attribution methods},
+  author={Stamatatos, Efstathios},
+  journal={Journal of the American Society for Information Science and Technology},
+  volume={60},
+  number={3},
+  pages={538--556},
+  year={2009},
+  publisher={Wiley Online Library}
+}
8 changes: 5 additions & 3 deletions report/writeup.tex
@@ -89,11 +89,13 @@ \section{Feature Extraction}
 
 \section{Methods}
 
-
-
+To explore our data we feed our extracted features as bags of n-grams to a novel LDA extension, the Multivalence Authorship Detection (MAD) Topic Model. The MAD topic model combines the SLDA algorithm presented in \cite{wang2009simultaneous} and \cite{Blei2007} with the Author Topic Model in \cite{rosen2004author}, extending both to account for multiple word types. For each word type $t$, MAD posits its own LDA topic model. Unlike conventional LDA, in which each document shares a common Dirichlet prior, MAD gives each author her own Dirichlet prior, which can be optimized with coordinate descent. This differs from the Author Topic Model, which treats each author's oeuvre as one contiguous document.
+Like SLDA, MAD has a multiclass regression parameter $\eta$, from which classes are drawn from $\text{softmax}(\eta^T\bar{z})$, where $\bar{z}$ are the average topic assignments for each work. The complete generative process is specified in the Appendix, and the graphical model is shown in... The key innovation in this model is that it is doubly supervised: first, each author has her own topic proportions, which enforce shared topics between her documents; second, upon conditioning on the multiclass logistic regression $\eta$, the topic assignments $z$ that contribute to correct classification are given a higher likelihood. Thus, one would expect that the \emph{more salient} features are selected for during inference. It is crucial to note that, during training, authorship is thereby treated as both a known label and a random variable. In the test stage, however, we marginalize over authors, as described in the Appendix.
+The model is fit with variational inference, discussed in \cite{wainwright2008graphical}. It is well known that exact inference for LDA requires computing a prohibitive integral \cite{Blei2003}. Instead, the posterior distribution $p(a,w,z,\theta \mid \alpha,\lambda,\eta)$ is approximated with a variational family $q(\theta|\gamma)\prod_{n}q(z|\phi)$, indexed by parameters $\gamma$ and $\phi$. Here $\theta|\gamma \sim \text{Dirichlet}(\gamma)$ and $z|\phi \sim \text{Multi}(\phi)$, so that complete conditionals of $\theta$ and $z$ under $p$ are in the same family as their variational counterparts. Up to a constant independent of the variational parameters $\phi$ and $\gamma$, the negative KL divergence between $q$ and $p$ gives a lower bound on the marginal log likelihood. This is known as the ELBO and, though nonconvex, can be optimized with coordinate-wise gradient ascent. The parameters of the model, notably the per-author topics, can be fit using maximum likelihood methods. There are a few subtleties required to accommodate the multiclass supervision, which we defer to the appendix. The updates follow \cite{wang2009simultaneous} very closely and, in the interest of brevity, are omitted.
+The model was implemented in C++ and based upon Chong Wang's code accompanying \cite{wang2009simultaneous}. In addition to the MAD Topic Model, our code supports $L1$ penalization and stochastic variational inference. Neither extension proved particularly effective, for reasons discussed in the appendix.
 \section{Evaluation}
 
 
 \section{Conclusions}
 
 \newpage
 \bibliography{writeup}
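A hedged restatement of the bound the Methods paragraph invokes, in the writeup's notation (the factorized $q(\theta|\gamma)\prod_n q(z|\phi)$ is assumed to be the $q$ below):

% Sketch only: the evidence lower bound (ELBO) referenced in Methods.
\begin{equation*}
\log p(w \mid \alpha, \lambda, \eta)
  \;=\; \underbrace{\mathbb{E}_q\!\left[\log p(w, z, \theta \mid \alpha, \lambda, \eta)\right]
  - \mathbb{E}_q\!\left[\log q(z, \theta)\right]}_{\text{ELBO } \mathcal{L}(\gamma, \phi)}
  \;+\; \mathrm{KL}\!\left(q(z, \theta) \,\middle\|\, p(z, \theta \mid w)\right),
\end{equation*}
% so maximizing the ELBO over gamma and phi minimizes the KL gap to the posterior.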
