forked from Kadrian/paper-gamification
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdetex.py
executable file
·230 lines (214 loc) · 9.86 KB
/
detex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
# Debug switch: when True, applyRegexps prints every rule before applying it.
testMode=False


def applyRegexps(text, listRegExp):
    """Apply a sequence of regular-expression substitutions to a text.

    Args:
        text: the string to transform.
        listRegExp: iterable of rules. Each rule is a dict whose 'left'
            entry is a regex pattern and whose 'right' entry is the
            replacement template passed to ``re.sub``.

    Returns:
        The text after all rules have been applied in order; each rule
        operates on the output of the previous one.
    """
    # Materialise once so a generator argument survives the debug dump below.
    rules = list(listRegExp)
    if testMode:
        # Rules are dicts, so they have to be rendered as strings before
        # they can be joined. A single parenthesised argument keeps this
        # line valid under both Python 2 and Python 3.
        print('\n'.join('%r -> %r' % (rule['left'], rule['right']) for rule in rules))
    for rule in rules:
        text = re.sub(rule['left'], rule['right'], text)
    return text
"""
_ _ ____
__| | ___| |_ _____ __/ /\ \
/ _` |/ _ \ __/ _ \ \/ / | | |
| (_| | __/ || __/> <| | | |
\__,_|\___|\__\___/_/\_\ | | |
\_\/_/
"""
def detex(latexText):
    """Transform a LaTeX text into a simple plain text.

    The conversion is a fixed pipeline of regular-expression passes, each
    applied through ``applyRegexps``:

    1. drop everything up to and including the first ``\\begin{document}``
       (the text is left untouched when that marker is absent);
    2. strip ``%`` comments (an escaped ``\\%`` is kept);
    3. replace formatting commands such as ``\\emph{...}`` by their argument;
    4. highlight sectioning commands as ``#--title--#`` and
       title/author/thanks/cite/ref arguments as ``[content]``;
    5. remove layout commands together with their optional and braced
       arguments;
    6. translate common symbols to ASCII, unwrap ``$...$``, fix spacing,
       drop leftover braces and tidy line ends.

    Only brace arguments without nested braces are recognised by the
    command rules.

    Args:
        latexText: the LaTeX source as a string.

    Returns:
        The plain-text rendering of ``latexText``.
    """
    # A TeX control word ends at the first non-letter. Without this guard a
    # command name also matches as a prefix of a longer one: `\right` would
    # eat the start of `\rightarrow`, `\le` would match inside `\leq`.
    word_end = r'(?![A-Za-z])'
    # One brace-delimited argument without nested braces; group 1 is its content.
    braced_arg = r'\{([^\}\{]*)\}'

    def rule(left, right):
        """Build one substitution rule in the form applyRegexps expects."""
        return {'left': left, 'right': right}

    # remove all the contents of the header, ie everything before (and
    # including) the first occurrence of "\begin{document}"
    text = re.sub(r"(?s).*?(\\begin\{document\})", "", latexText, count=1)

    # remove comments; the lookbehind (rather than a consumed character)
    # also catches a comment that starts at the very first character
    text = applyRegexps(text, [rule(r'(?<!\\)%.*', r'')])

    # - replace some LaTeX commands by the contents inside curly brackets
    to_reduce = [r'\\emph', r'\\textbf', r'\\textit', r'\\text', r'\\IEEEauthorblockA', r'\\IEEEauthorblockN', r'\\author', r'\\caption', r'\\author', r'\\thanks']
    text = applyRegexps(text, [rule(tag + braced_arg, r'\1') for tag in to_reduce])

    # ---- highlight ----
    # - replace some LaTeX commands by the contents inside curly brackets
    #   and highlight these contents
    # highlight pattern: #--content--#
    headings = [r'\\part[\*]*', r'\\chapter[\*]*', r'\\section[\*]*', r'\\subsection[\*]*', r'\\subsubsection[\*]*', r'\\paragraph[\*]*']
    # highlight pattern: [content]
    bracketed = [r'\\title', r'\\author', r'\\thanks', r'\\cite', r'\\ref']
    regexps = [rule(tag + braced_arg, r'\n#--\1--#\n') for tag in headings]
    regexps += [rule(tag + braced_arg, r'[\1]') for tag in bracketed]
    text = applyRegexps(text, regexps)

    # ---- remove ----
    # - remove completely some LaTeX commands, with their optional [..] and
    #   braced {..} arguments, replacing the whole thing by a single space
    to_remove = [r'\\maketitle', r'\\footnote', r'\\centering', r'\\IEEEpeerreviewmaketitle', r'\\includegraphics', r'\\IEEEauthorrefmark', r'\\label', r'\\begin', r'\\end', r'\\big', r'\\right', r'\\left', r'\\documentclass', r'\\usepackage', r'\\bibliographystyle', r'\\bibliography', r'\\cline', r'\\multicolumn']
    any_args = r'(\[[^\]]*\])*(\{[^\}\{]*\})*'
    text = applyRegexps(text, [rule(tag + word_end + any_args, r' ') for tag in to_remove])

    # ---- replace ----
    regexps = [
        # replace some symbols by their ascii equivalent
        # - common symbols
        rule(r'\\eg' + word_end + r'(\{\})* *', r'e.g., '),
        rule(r'\\ldots', r'...'),
        rule(r'\\Rightarrow' + word_end, r'=>'),
        rule(r'\\rightarrow' + word_end, r'->'),
        rule(r'\\leftarrow' + word_end, r'<-'),
        rule(r'\\leq?' + word_end, r'<='),
        rule(r'\\geq?' + word_end, r'>='),
        rule(r'\\_', r'_'),
        rule(r'\\\\', r'\n'),
        rule(r'~', r' '),
        # Column separators must become tabs BEFORE `\&` is unescaped,
        # otherwise the freshly unescaped `&` would be turned into a tab too.
        rule(r'(?<!\\)&', r'\t'),
        rule(r'\\&', r'&'),
        rule(r'\\%', r'%'),
        rule(r'\\item' + word_end, r'\t- '),
        rule(r'\\hline[ \t]*\\hline', r'============================================='),
        rule(r'[ \t]*\\hline', r'_____________________________________________'),
        # - special letters
        rule(r'\\\'{?\{e\}}?', r'é'),
        rule(r'\\`{?\{a\}}?', r'à'),
        rule(r'\\\'{?\{o\}}?', r'ó'),
        rule(r'\\\'{?\{a\}}?', r'á'),
        # keep untouched the contents of the equations
        rule(r'\$(.)\$', r'\1'),
        rule(r'\$([^\$]*)\$', r'\1'),
        # remove the equation symbols ($)
        rule(r'([^\\])\$', r'\1'),
        # correct spacing problems
        # (replacement templates hold plain characters: an escape such as
        # `\)` is unknown to re.sub and would be copied out with its backslash)
        rule(r' +,', r','),
        rule(r' +', r' '),
        rule(r' +\)', r')'),
        rule(r'\( +', r'('),
        rule(r' +\.', r'.'),
        # remove lonely curly brackets, then unescape the literal ones
        rule(r'^([^\{]*)\}', r'\1'),
        rule(r'([^\\])\{([^\}]*)\}', r'\1\2'),
        rule(r'\\\{', r'{'),
        rule(r'\\\}', r'}'),
        # strip white space characters at end of line
        rule(r'[ \t]*\n', r'\n'),
        # collapse runs of three or more line ends into a single one
        rule(r'([ \t]*\n){3,}', r'\n'),
    ]
    # apply all those regexps and return the modified text
    return applyRegexps(text, regexps)
"""
_
_ __ ___ __ _(_)_ __
| '_ ` _ \ / _` | | '_ \
| | | | | | (_| | | | | |
|_| |_| |_|\__,_|_|_| |_|
"""
def main():
    """Debugging aid: run detex on a built-in sample paper and print the result."""
    # Sample LaTeX document exercising the header, comments, sectioning,
    # tables, lists, figures, citations and bibliography commands.
    latexText=r"""
% This paper can be formatted using the peerreviewca
% (instead of conference) mode.
\documentclass[twocolumn,a4paper]{article}
%\documentclass[peerreviewca]{IEEEtran}
% correct bad hyphenation here
\hyphenation{op-ti-cal net-works semi-con-duc-tor IEEEtran pri-va-cy Au-tho-ri-za-tion}
% package for printing the date and time (version)
\usepackage{time}
\begin{document}
\title{Next Generation Networks}
\author{Tot titi\thanks{Network and Security -- test company -- toto@ieee.org}}
\maketitle
\begin{abstract}\footnote{Version : \today ; \now}
lorem ipsum(\ldots)\end{abstract}
\emph{Keywords: IP Multimedia Subsystem, Quality of Service}
\section{Introduction} \label{sect:introduction}
lorem ipsum(\ldots) \% of the world population. \cite{TISPAN2006a}. \footnote{Bearer Independent Call Control protocol}.
\hline
\section{Protocols used in IMS} \label{sect:protocols}
lorem ipsum(\ldots) \cite{rfc2327, rfc3264}.
\subsection{Authentication, Authorization, and Accounting} \label{sect:protocols_aaa}
lorem ipsum(\ldots)
\subsubsection{Additional protocols} \label{sect:protocols_additional}
lorem ipsum(\ldots)
\begin{table}
\begin{center}
\begin{tabular}{|c|c|c|}
\hline
\textbf{Capability} & \textbf{UE} & \textbf{GGSN} \\ \hline
\emph{DiffServ Edge Function} & Optional & Required \\ \hline
\emph{RSVP/IntServ} & Optional & Optional \\ \hline
\emph{IP Policy Enforcement Point} & Optional & Required \\ \hline
\end{tabular}
\caption{IP Bearer Services Manager capability in the UE and GGSN}
\label{tab_ue_ggsn}
\end{center}
\end{table}
The main transport layer functions are listed below:
\begin{my_itemize}
\item The \emph{Resource Control Enforcement Function} (RCEF) enforces policies under the control of the A-RACF. It opens and closes unidirectional filters called \emph{gates} or \emph{pinholes}, polices traffic and marks IP packets \cite{TISPAN2006c}.
\item The \emph{Border Gateway Function} (BGF) performs policy enforcement and Network Address Translation (NAT) functions under the control of the S-PDF. It operates on unidirectional flows related to a particular session (micro-flows) \cite{TISPAN2006c}.
\item The \emph{Layer 2 Termination Point} (L2TP) terminates the Layer 2 procedures of the access network \cite{TISPAN2006c}.
\end{my_itemize}
Their QoS capabilities are summarized in table \ref{tab_rcef_bgf} \cite{TISPAN2006c}.
The admission control usually follows a three step procedure:
\begin{my_enumerate}
\item Authorization of resources (\eg by the A-RACF)
\item Resource reservation (\eg by the BGF)
\item Resource commitment (\eg by the RCEF)
\end{my_enumerate}
\begin{figure}
\centering
\includegraphics[width=1.5in]{./pictures/RACS_functional_architecture}
\caption{RACS interaction with transfer functions}
\label{fig_RACS_functional_architecture}
\end{figure}
%\subsection{Example} \label{sect:qos_example}
% conference papers do not normally have an appendix
% use section* for acknowledgement
\section*{Acknowledgment}
% optional entry into table of contents (if used)
%\addcontentsline{toc}{section}{Acknowledgment}
lorem ipsum(\ldots)
\bibliographystyle{plain}
%\bibliographystyle{alpha}
\bibliography{./mabiblio}
\end{document}
"""
    text=detex(latexText)
    # A single parenthesised argument prints identically under Python 2 and 3.
    print(text)
# Run the built-in sample through main() only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()