diff --git a/DataScienceEssentials.tex b/DataScienceEssentials.tex index 52e5565..e07ceac 100644 --- a/DataScienceEssentials.tex +++ b/DataScienceEssentials.tex @@ -41,6 +41,7 @@ \mainmatter % LABS ============================================================ +\part{Labs} \subimport{./DataScienceEssentials/SQL1/}{SQL1} \subimport{./DataScienceEssentials/SQL2/}{SQL2} % \subimport{./DataScienceEssentials/UnixShell1/}{UnixShell1} @@ -57,6 +58,13 @@ % \subimport{./DataScienceEssentials/Parallel_Intro/}{parallel1} % \subimport{./DataScienceEssentials/MPI/}{mpi} +\part{Appendices} % Relevant Appendices --------------------------------------- +\begin{appendices} +\subimport{./Appendices/Setup/}{SetupStudent} +\subimport{./Appendices/Installation/}{Installation} +\subimport{./Appendices/NumpyVisualGuide/}{NumpyVisualGuide} +\end{appendices} + % TODO: uncomment these when we have citations in this manual. % check with `find DataScienceEssentials -type f -name "*.tex" | xargs grep --color "cite{"` % \bibliographystyle{alpha} diff --git a/DataScienceEssentials/RegularExpressions/RegularExpressions.tex b/DataScienceEssentials/RegularExpressions/RegularExpressions.tex index 2b462fd..4391df9 100644 --- a/DataScienceEssentials/RegularExpressions/RegularExpressions.tex +++ b/DataScienceEssentials/RegularExpressions/RegularExpressions.tex @@ -4,59 +4,46 @@ This lab introduces regular expression syntax and common practices, including an application to a data cleaning problem. } -% TODO: use fullmatch() instead of search() in the examples up to the end of the regex syntax section. - % TODO: Important links! 
% \url{https://docs.python.org/3/howto/regex.html} % \url{https://docs.python.org/3/library/re.html} - % As an example, the following program will find and fill in missing colons after control statements in normally formatted python files: - % % ^((def|if|elif|else|while|for|with).*):\s*$ - A \emph{regular expression} or \emph{regex} is a string of characters that follows a certain syntax to specify a pattern. Strings that follow the pattern are said to \emph{match} the expression (and vice versa). -A single regular expression can match a large set of strings, such as the set all valid email addresses. -% The idea is similar to using wildcards like \li{*} in the Unix shell: the command \li{ls *.py} displays \textbf{all} files that have a \texttt{.py} file extension. +A single regular expression can match a large set of strings, such as the set of all valid email addresses. \begin{warn} There are some universal standards for regular expression syntax, but the exact syntax varies slightly depending on the program or language. However, the syntax presented in this lab (for Python) is sufficiently similar to any other regex system. -Consider learning to use regular expressions in vim or your favorite text editor, keeping in mind that there are bound to be slight syntactic differences from what is presented here. +Consider learning to use regular expressions in Vim or your favorite text editor, keeping in mind that there will be slight syntactic differences from what is presented here. \end{warn} \section*{Regular Expression Syntax in Python} % ============================== The \li{re} module implements regular expressions in Python. The function \li{re.<>()} takes in a regular expression string and returns a corresponding \emph{pattern} object, which has methods for determining if and how other strings match the pattern. 
-For example, the \li{search()} method returns \li{None} for a string that doesn't match, and a \emph{match} object for a string that does (more on these later). +For example, the \li{search()} method returns \li{None} for a string that doesn't match, and a \emph{match} object for a string that does. + +Note the \li{match()} method for pattern objects only matches strings that satisfy the pattern \textbf{at the beginning} of the string. +To answer the question ``does any part of my target string match this regular expression?'' always use the \li{search()} method. \begin{lstlisting} >>> import re >>> pattern = re.<>("cat") # Make a pattern for finding 'cat'. >>> bool(pattern.search("cat")) # 'cat' matches 'cat', of course. <> ->>> bool(pattern.search("catfish")) # 'catfish' also contains 'cat'. +>>> bool(pattern.match("catfish")) # 'catfish' starts with 'cat'. <> ->>> bool(pattern.search("hat")) # 'hat' does not contain 'cat'. +>>> bool(pattern.match("fishcat")) # 'fishcat' doesn't start with 'cat'. <> -\end{lstlisting} - -\begin{warn} % re.match() only matches the beginning! -The poorly named \li{match()} method for pattern objects only matches strings that satisfy the pattern \textbf{at the beginning} of the string. -To answer the question ``does any part of my target string match this regular expression?'' always use the \li{search()} method. -\begin{lstlisting} ->>> pattern = re.<>("cat") ->>> bool(pattern.match("catfish")) +>>> bool(pattern.search("fishcat")) # but it does contain 'cat'. <> ->>> bool(pattern.match("fishcat")) +>>> bool(pattern.search("hat")) # 'hat' does not contain 'cat'. <> ->>> bool(pattern.search("fishcat")) -<> \end{lstlisting} -\end{warn} -\begin{info} % re.search() vs. re.compile().search() Most of the functions in the \li{re} module are shortcuts for compiling a pattern object and calling one of its methods. 
+Using \li{re.<>()} is good practice because the resulting object is reusable, while each call to \li{re.search()} compiles a new (but redundant) pattern object. For example, the following lines of code are equivalent. \begin{lstlisting} >>> bool(re.<>("cat").search("catfish")) @@ -64,20 +51,10 @@ \section*{Regular Expression Syntax in Python} % ============================== >>> bool(re.search("cat", "catfish")) <> \end{lstlisting} -Using \li{re.<>()} is good practice because the resulting object is reusable, while each call to \li{re.search()} compiles a new (but redundant) pattern object. -\end{info} + \begin{problem} % Easy regular expression, construct test cases. Write a function that compiles and returns a regular expression pattern object with the pattern string \li{"python"}. - -Construct positive and negative test cases to test your object. -Having good test cases will be important later, so be thorough. -Your verification process might start as follows. -\begin{lstlisting} ->>> pattern = re.<>("cat") ->>> positive = ["cat", "catfish", "fish cat", "your cat ran away"] ->>> assert all(pattern.search(p) for p in positive) -\end{lstlisting} \label{prob:regex-superbasic} \end{problem} @@ -86,24 +63,16 @@ \subsection*{Literal Characters and Metacharacters} % ------------------------- The following string characters (separated by spaces) are \emph{metacharacters} in Python's regular expressions, meaning they have special significance in a pattern string: \li{. ^ \$ * + ? \{ \} [ ] \\ | ( )}. -To construct a regular expression that matches strings with one or more metacharacters in them requires two things. -First, use \emph{raw strings} instead of regular Python strings by prefacing the string with an \li{r}, such as \li{r"cat"}. +A regular expression that matches strings with one or more metacharacters requires two things. 
+\begin{enumerate} + \item Use \emph{raw strings} instead of regular Python strings by prefacing the string with an \li{r}, such as \li{r"cat"}. The resulting string interprets backslashes as actual backslash characters, rather than the start of an escape sequence like \li{\\n} or \li{\\t}. -Second, preface any metacharacters with a backslash to indicate a literal character. -For example, the following code constructs a regular expression to match the string \li{"\$3.99? Thanks."}. - -\begin{lstlisting} ->>> dollar = re.<>(r"\$3\.99\? Thanks\.") ->>> bool(dollar.search("$3.99? Thanks.")) -<> ->>> bool(dollar.search("$3\.99? Thanks.")) -<> ->>> bool(dollar.search("$3.99?")) # Doesn't contain the entire pattern. -<> -\end{lstlisting} + \item Preface any metacharacters with a backslash to indicate a literal character. +For example, to match the string \li{"\$3.99? Thanks."}, use \li{r"\\\$3\\.99\\? Thanks\\."}. +\end{enumerate} Without raw strings, every backslash in has to be written as a double backslash, which makes many regular expression patterns hard to read (\li{"\\\\\$3\\\\.99\\\\? Thanks\\\\."}). -Readability counts. + \begin{problem} Write a function that compiles and returns a regular expression pattern object that matches the string \li{"^\{@\}(?)[\%]\{.\}(*)[_]\{&\}\$"}. @@ -111,58 +80,15 @@ \subsection*{Literal Characters and Metacharacters} % ------------------------- \end{problem} The regular expressions of Problems \ref{prob:regex-superbasic} and \ref{prob:regex-metacharacter-literals} only match strings that are or include the exact pattern. -The metacharacters allow regular expressions to have much more flexibility and control so that a single pattern could match a wide variety of strings, or a very specific set of strings. - -To begin, the \emph{line anchor} metacharacters \li{^} and \li{\$} are used to match the \textbf{start} and the \textbf{end} of a line of text, respectively. 
+The metacharacters allow regular expressions to have much more flexibility and control so that a single pattern can match a wide variety of strings, or a very specific set of strings. +The \emph{line anchor} metacharacters \li{^} and \li{\$} are used to match the \textbf{start} and the \textbf{end} of a line of text, respectively. This shrinks the matching set, even when using the \li{search()} method instead of the \li{match()} method. -For example, the only single-line string that the expression \li{^x\$} matches is \li{'x'}, whereas the expression \li{x} can match any string with an \li{x} in it. - -\begin{lstlisting} ->>> has_x, just_x = re.<>(r"x"), re.<>(r"^x$") ->>> for test in ["x", "xabc", "abcx"]: -... print(test + ':', bool(has_x.search(test)), bool(just_x.search(test))) -... -<> # Starts with 'x', but doesn't end with it. -<> # Ends with 'x', but doesn't start with it. -\end{lstlisting} - -% TODO: put this back in later. -\begin{comment} -An added benefit of using \li{'^'} and \li{'\$'} is that they allow you to search across multiple lines. -For example, how would we match \li{"World"} in the string \mbox{\li{"Hello\\nWorld"}}? -Using \li{re.MULTILINE} in the \li{re.search} function will allow us to match at the beginning of each new line, instead of just the beginning of the string. -Since we have seen two ways to match strings with regex expressions, the following shows two ways to implement multiline searching: - -\begin{lstlisting} ->>>bool(re.search("^W","Hello\nWorld")) -<> ->>>bool(re.search("^W","Hello\nWorld", re.MULTILINE)) -<> ->>>pattern1 = re<<.compile>>("^W") ->>>pattern2 = re<<.compile>>("^W", re.MULTILINE) ->>>bool(pattern1.search("Hello\nWorld")) -<> ->>>bool(pattern2.search("Hello\nWorld")) -<> -\end{lstlisting} - -For simplicity, the rest of the lab will focus on single line matching. -\end{comment} - -The \emph{pipe} character \li{|} is like a logical \li{OR} in a regular expression: \li{A|B} matches \li{A} or \li{B}. 
-\begin{lstlisting} ->>> rb, rbg = re.<>(r"^red$|^blue$"), re.<>(r"^red$|^blue$|^green$") ->>> for test in ["red", "blue", "green", "redblue"]: -... print(test + ":", bool(rb.search(test)), bool(rbg.search(test))) -<> # The line anchors prevent matching here. -\end{lstlisting} +For example, the only single-line string that the expression \li{'^x\$'} matches is \li{'x'}, whereas the expression \li{'x'} can match any string with an \li{'x'} in it. +The \emph{pipe} character \li{|} is a logical \li{OR} in a regular expression: \li{A|B} matches \li{A} or \li{B}. The parentheses \li{()} create a \emph{group} in a regular expression. -Among other things, a group establishes an order of operations in an expression, much like how parentheses work in an arithmetic expression such as $3\cdot(4+5)$. +A group establishes an order of operations in an expression. +For example, in the regex \li{"^one|two fish\$"}, precedence is given to the invisible string concatenation between \li{"two"} and \li{"fish"}, while \li{"^(one|two) fish\$"} gives precedence to the \li{'\|'} metacharacter. \begin{lstlisting} >>> fish = re.<>(r"^(one|two) fish$") @@ -175,11 +101,8 @@ \subsection*{Literal Characters and Metacharacters} % ------------------------- one two fish: False>> \end{lstlisting} -% Parentheses help give regular expressions higher precedence. -% For example, \li{"^one|two fish\$"} gives precedence to the invisible string concatenation between \li{"two"} and \li{"fish"} while \li{"^(one|two) fish\$"} gives precedence to the \li{'\|'} metacharacter. - \begin{problem} -Write a function that compiles and returns a regular expression pattern object that matches the following strings (and no other strings, even with \li{re.search()}). +Write a function that compiles and returns a regular expression pattern object that matches the following strings, and no other strings, even with \li{re.search()}.
\centering \begin{tabular}{lll} @@ -200,7 +123,7 @@ \subsection*{Character Classes} % --------------------------------------------- In other words, \li{^} is the set complement operation on the character class. Additionally, the dash \li{-} specifies a range of values. For instance, \li{[0-9]} matches any digit, and \li{[a-z]} matches any lowercase letter. -Thus \li{[^0-9]} matches any character \textbf{except} for a digit, and \li{[^a-z]} matches any character \textbf{except} for a lowercase letters +Thus \li{[^0-9]} matches any character \textbf{except} for a digit, and \li{[^a-z]} matches any character \textbf{except} for lowercase letters. Keep in mind that the dash \li{-}, when at the beginning or end of the character class, will match the literal \li{'-'}. \begin{lstlisting} >>> p1, p2 = re.<>(r"^[a-z][^0-7]$"), re.<>(r"^[^abcA-C][0-27-9]$") @@ -224,10 +147,10 @@ \subsection*{Character Classes} % --------------------------------------------- Character & Description \\ \hline % \li{\\number} & Matches the contents of the group of the same number. \\ \li{\\b} & Matches the empty string, but only at the start or end of a word. \\ -\li{\\d} & Matches any decimal digit; equivalent to \li{[0-9]}. \\ -\li{\\D} & Matches any non-digit character; equivalent to \li{[^\\d]}. \\ \li{\\s} & Matches any whitespace character; equivalent to \li{[ \\t\\n\\r\\f\\v]}. \\ \li{\\S} & Matches any non-whitespace character; equivalent to \li{[^\\s]}. \\ +\li{\\d} & Matches any decimal digit; equivalent to \li{[0-9]}. \\ +\li{\\D} & Matches any non-digit character; equivalent to \li{[^\\d]}. \\ \li{\\w} & Matches any alphanumeric character; equivalent to \li{[a-zA-Z0-9_]}. \\ \li{\\W} & Matches any non-alphanumeric character; equivalent to \li{[^\\w]}. \\ % \li{\\\\} & Matches a literal backslash. 
\\ @@ -239,7 +162,7 @@ \subsection*{Character Classes} % --------------------------------------------- Any of the character class shortcuts can be used within other custom character classes. For example, \li{[\_A-Z\\s]} matches an underscore, capital letter, or whitespace character. -Finally, a period \li{.} matches \textbf{any} character except for a line break, and is therefore equivalent to \li{[\^\\n]} on UNIX machines and \li{[\^\\r\\n]} on Windows machines. +Finally, a period \li{.} matches \textbf{any} character except for a line break. This is a very powerful metacharacter; be careful to only use it when part of the regular expression really should match \textbf{any} character. \begin{lstlisting} @@ -265,6 +188,8 @@ \subsection*{Character Classes} % --------------------------------------------- Boba?: False>> \end{lstlisting} +The following table is a useful recap of some common regular expression metacharacters. + \begin{table}[H] \begin{tabular}{c|l} Character & Description \\ \hline @@ -300,45 +225,6 @@ \subsection*{Repetition} % ---------------------------------------------------- \label{table:regex-special-characters2} \end{table} - -% The \li{'*'} metacharacter means ``Match zero or more times (\textbf{as many as possible})'' when it follows another regular expression. -% The \li{'+'} metacharacter means ``Match one or more times (as many as possible)'' when it follows another regular expression. -% The \li{'?'} metacharacter means ``Match one time (if possible) or do nothing (i.e. match zero times)'' when it follows another regular expression: -% The curly brace metacharacters are used to specify a more precise amount of repetition: - -\begin{lstlisting} -# Match 0 or more 'a' characters, ending in a 'b'. ->>> pattern = re.<>(r"^a*b$") ->>> for test in ["b", "ab", "aaaaaaaaaab", "aba"]: -... print(test + ':', bool(pattern.search(test))) -... -<> # 0 'a' characters, then 1 'b'. -<> -<> # Several 'a' characters, then 1 'b'. 
-<> # 'b' must be the last character. - -# Match an 'h' followed by at least one 'i' or 'a' characters. ->>> pattern = re.<>(r"^h[ia]+$") ->>> for test in ["ha", "hii", "hiaiaa", "h", "hah"]: -... print(test + ':', bool(pattern.search(test))) -... -<> -<> # [ia] matches 'i' or 'a'. -<> # Need at least one 'i' or 'a' -<> # 'i' or 'a' must be the last character. - -# Match an 'a' followed by 'b' followed by 0 or 1 'c' characters. ->>> pattern = re.<>(r"^abc?$") ->>> for test in ["ab", "abc", "abcc", "ac"]: -... print(test + ':', bool(pattern.search(test))) -... -<> -<> # Only up to one 'c' is allowed. -<> # Missing the 'b'. -\end{lstlisting} - Each of the repetition operators acts on the expression immediately preceding it. This could be a single character, a group, or a character class. For instance, \li{(abc)+} matches \li{abc}, \li{abcabc}, \li{abcabcabc}, and so on, but not \li{aba} or \li{cba}. @@ -360,8 +246,8 @@ \subsection*{Repetition} % ---------------------------------------------------- <> \end{lstlisting} -\begin{warn} % Use line anchors with repetition operators. -Line anchors are especially important when using repetition operators. + +Be aware that line anchors are especially important when using repetition operators. Consider the following (bad) example and compare it to the previous example. \begin{lstlisting} @@ -377,28 +263,12 @@ \subsection*{Repetition} % ---------------------------------------------------- \end{lstlisting} The unexpected matches occur because \li{"aaa"} is at the beginning of each of the test strings. With the line anchors \li{^} and \li{\$}, the search truly only matches the exact string \li{"aaa"}. -\end{warn} -\begin{problem} -A \emph{valid Python identifier} (a valid variable name) is any string staring with an alphabetic character or an underscore, followed by any (possibly empty) sequence of alphanumeric characters and underscores. 
-Define a function that compiles and returns a regular expression pattern object that matches any valid Python identifier. -\\(Hint: Use the \li{\\w} character class shortcut to keep your regular expression clean.) - -Check your regular expression against the following words. -These test cases are a good start, but are not exhaustive. - -\centering -\begin{tabular}{c|lllll} -Matches: & \li{"Mouse"} & \li{"compile"} & \li{"_123456789"} & \li{"__x__"} & \li{"while"} \\ \hline -Non-matches: & \li{"3rats"} & \li{"err*r"} & \li{"sq(x)"} & \li{"sleep()"} & \li{" x"} -\end{tabular} -% As you might have noticed, using this definition, \li{"while"} is considered a valid python identifier, even though it really is a reserved word. In the following problems, we will make a few other simplifying assumptions about the python language. -\end{problem} +\begin{problem} +A \emph{valid Python identifier} (a valid variable name) is any string starting with an alphabetic character or an underscore, followed by any (possibly empty) sequence of alphanumeric characters and underscores. -\begin{comment} % Might be worth putting this in... -\begin{problem} A \emph{valid python parameter definition} is defined as the concatenation of the following strings: \begin{itemize} \item any valid python identifier @@ -406,82 +276,29 @@ \subsection*{Repetition} % ---------------------------------------------------- \item (optional) an equals sign followed by any number of spaces and ending with one of the following: any real number, a single quote followed by any number of non-single-quote characters followed by a single quote, or any valid python identifier \end{itemize} -Define a variable \li{parameter_pattern_string} that defines a regular expression that matches valid python parameter definitions. 
- -For example, each element of \li{["max=4.2", "string= ''", "num_guesses", "msg ='\\\\'", "volume_fn=_CALC_VOLUME"]} is a valid python parameter definition, while each element of \li{["300", "no spaces", "is_4=(value==4)", "pattern = r'^one|two fish\$'", 'string="these last two are actually valid in python, but they should not be matched by your pattern"']} is not. % TODO add more negative examples maybe? -\end{problem} -\end{comment} - - -\begin{comment} % Might be worth putting this in... -\begin{problem} -A \emph{valid python function definition} is defined as the concatenation of the following strings: -\begin{itemize} - \item \li{"def"} - \item Any number of spaces - \item any valid python identifier - \item \li{"("} - \item a sequence of any number of (possibly zero) valid python parameter definitions, separated by any number of spaces followed by a comma followed by any number of spaces - \item \li{")"} - \item \li{":"} -\end{itemize} -with any number of spaces between each element of the above list. - -Define a variable \li{function_pattern_string} that defines a regular expression that matches valid python function definitions. - -For example, the program should behave as follows: -\begin{lstlisting} ->>> run match_function_definition.py -Enter a string>>> def compile(pattern,string): -<> -Enter a string>>> def space ( ) : -<> -Enter a string>>> def func(_dir, file_path='\Desktop\files', val=_PI): -<> -Enter a string>>> def func(num=3., num=.5, num=0.0): -<> -Enter a string>>> def func(num=., error,): -<> -Enter a string>>> def variable: -<> -Enter a string>>> def not.allowed(, *args): -<> -Enter a string>>> def err*r('no parameter name'): -<> -Enter a string>>> def func(value=_MY_CONSTANT, msg='%s' % _DEFAULT_MSG): -<> -Enter a string>>> def func(s1='', a little tricky, s2=''): -<> -Enter a string>>> def func(): Remember your line anchors! 
-<> -Enter a string>>> deffunc() -<> -Enter a string>>> func(): -<> -Enter a string>>> exit - -\end{lstlisting} +Define a function that compiles and returns a regular expression pattern object that matches any valid Python parameter definition. +\\(Hint: Use the \li{\\w} character class shortcut to keep your regular expression clean.) -\begin{warn} -In the end, my variable \li{function_pattern_string} was \emph{215 characters long}. You WILL make a mistake while defining \li{function_pattern_string}; do you want to try to debug a 215-character regular expression? Do NOT try to define it all at once! +To help in debugging, the following examples may be useful. +These test cases are a good start, but are not exhaustive. +The first table should match valid Python identifiers. +The second should match a valid python parameter definition, as defined in this problem. +Note that some strings which would be valid in python will not be for this problem. -Instead, use your previously defined regular expressions to make this easier. For example, either of the two following idioms will work: -\begin{lstlisting} ->>> key_1 = "basic" ->>> print("This is a " + key_1 + " way to concatenate strings.") -This is a basic way to concatenate strings. ->>> format_dict = {"key_1": "basic", "key_2": "much more", "key_3": "advanced"} ->>> print("This is a {key_2} {key_3} way to concatenate strings. It's {key_2} flexible.".format(**format_dict)) -This is a much more advanced way to concatenate strings. It's much more flexible. -\end{lstlisting} -Keep in mind that you'll have to remove the line anchors from your previously defined regular expressions. 
+\centering +\begin{tabular}{c|lllll} +Matches: & \li{"Mouse"} & \li{"compile"} & \li{"_123456789"} & \li{"__x__"} & \li{"while"} \\ \hline +Non-matches: & \li{"3rats"} & \li{"err*r"} & \li{"sq(x)"} & \li{"sleep()"} & \li{" x"} +\end{tabular} -For reference, I used about ten lines to define \li{function_pattern_string} and used statements of the form \li{intermediate_pattern_string = r"(my regular expression here)".format()} four times. -\end{warn} -\label{prob:match_function_definition} +\centering +\begin{tabular}{c|lllll} +Matches: & \li{"max=4.2"} & \li{"string= ''"} & \li{"num_guesses"} \\ \hline +Non-matches: & \li{"300"} & \li{"is_4=(value==4)"} & \li{"pattern = r'^one|two fish\$'"} +\end{tabular} \end{problem} -\end{comment} + \section*{Manipulating Text with Regular Expressions} % ====================== @@ -503,24 +320,6 @@ \section*{Manipulating Text with Regular Expressions} % ====================== \caption{Methods of regular expression pattern objects.} \end{table} -\begin{lstlisting} -# Find words that start with 'cat'. ->>> expr = re.<>(r"\bcat\w*") # \b is the shortcut for a word boundary. - ->>> target = "Let's catch some catfish for the cat" ->>> bool(expr.search(target)) # Check to see if there is a match. -<> - ->>> expr.findall(target) # Get all matching substrings. -<<['catch' 'catfish', 'cat']>> - ->>> expr.sub("DOG", target) # Substitute 'DOG' for the matches. -<<"Let's DOG some DOG for the DOG">> - ->>> expr.split(target) # Split the target by the matches. -<<["Let's ", ' some ', ' for the ', '']>> -\end{lstlisting} - Some substitutions require remembering part of the text that the regular expression matches. Groups are useful here: each group in the regular expression can be represented in the substitution string by \li{\\n}, where $n$ is an integer (starting at 1) specifying which group to use. 
@@ -533,7 +332,6 @@ \section*{Manipulating Text with Regular Expressions} % ====================== <<"Let's atchclay some atfishclay for the atclay">> \end{lstlisting} -\begin{info} % Greedy repetition, check with findall(). The repetition operators \li{?}, \li{+}, \li{*}, and \li{\{m,n\}} are \emph{greedy}, meaning that they match the largest string possible. On the other hand, the operators \li{??}, \li{+?}, \li{*?}, and \li{\{m,n\}?} are \emph{non-greedy}, meaning they match the smallest strings possible. This is very often the desired behavior for a regular expression. @@ -551,7 +349,7 @@ \section*{Manipulating Text with Regular Expressions} % ====================== >>> nongreedy.findall(target) <<['', '', '']>> # Each <> set is an individual match. \end{lstlisting} -\end{info} + Finally, there are a few customizations that make searching larger texts manageable. Each of these \emph{flags} can be used as keyword arguments to \li{re.<>()}. @@ -567,31 +365,29 @@ \section*{Manipulating Text with Regular Expressions} % ====================== \caption{Regular expression flags.} \end{table} +A benefit of using \li{'^'} and \li{'\$'} is that they allow you to search across multiple lines. +For example, how would we match \li{"World"} in the string \mbox{\li{"Hello\\nWorld"}}? +Using \li{re.MULTILINE} in the \li{re.search} function will allow us to match at the beginning of each new line, instead of just the beginning of the string. +The following shows how to implement multiline searching: + \begin{lstlisting} -# Match any line with 3 consecutive 'a' characters somewhere. ->>> pattern = re.<>("^.*a{3}.*$", re.MULTILINE) # Search each line. ->>> pattern.findall(""" -This is aaan example. -This is not an example. -Actually, it's an example, but it doesn't match. -This example does maaatch though.""") -<<['This is aaan example.', 'This example does maaatch though.']>> - -# Match anything instance of 'cat', ignoring case. 
->>> catfinder = re.compile("cat", re.IGNORECASE) ->>> catfinder.findall("cat CAT cAt TAC ctacATT") -<<['cat', 'CAT', 'cAt', 'cAT']>> +>>>pattern1 = re<<.compile>>("^W") +>>>pattern2 = re<<.compile>>("^W", re.MULTILINE) +>>>bool(pattern1.search("Hello\nWorld")) +<> +>>>bool(pattern2.search("Hello\nWorld")) +<> \end{lstlisting} \begin{problem} A Python \emph{block} is composed of several lines of code with the same indentation level. Blocks are delimited by key words and expressions, followed by a colon. Possible key words are \li{if}, \li{elif}, \li{else}, \li{for}, \li{while}, \li{try}, \li{except}, \li{finally}, \li{with}, \li{def}, and \li{class}. -Some of these keywords require an expression of some sort to follow before the colon (\li{if}, \li{elif}, \li{for}, etc.), some require no expressions to follow before the colon (\li{else}, \li{finally}), and \li{except} may or may not have an expression following before the colon. +Some of these keywords require an expression to precede the colon (\li{if}, \li{elif}, \li{for}, etc.). +Some require no expressions to precede the colon (\li{else}, \li{finally}), and \li{except} may or may not have an expression before the colon. Write a function that accepts a string of Python code and uses regular expressions to place colons in the appropriate spots. -You may assume that every colon is missing in the input string. -See the following for an example. +Assume that every colon is missing in the input string. \begin{lstlisting} """ @@ -650,18 +446,17 @@ \section*{Manipulating Text with Regular Expressions} % ====================== \end{lstlisting} \end{problem} -\begin{comment} \newpage \section*{Additional Material} % ============================================== \section*{Regular Expressions in the Unix Shell} -As we have seen thus far, regular expressions are very useful when we want to match patterns. +As we have seen, regular expressions are very useful when we want to match patterns.
Regular expressions can be used when matching patterns in the Unix Shell. Though there are many Unix commands that take advantage of regular expressions, we will focus on \li{grep} and \li{awk}. \subsection*{Regular Expressions and grep} -Recall that \li{grep} is used to match patterns in files or output. +Recall from Lab 1 that \li{grep} is used to match patterns in files or output. It turns out we can use regular expressions to define the pattern we wish to match. In general, we use the following syntax: @@ -677,19 +472,26 @@ \subsection*{Regular Expressions and grep} \end{lstlisting} %$ - \subsection*{Regular Expressions and awk} -As in Lab 2, we will be using \li{awk} to format output. By incorporating regular expressions, \li{awk} becomes much more robust. Before GUI spreedsheet programs like Microsoft Excel, \li{awk} was commonly used to visualize and query data from a file. +By incorporating regular expressions, the \li{awk} command becomes much more robust. +Before GUI spreadsheet programs like Microsoft Excel, \li{awk} was commonly used to visualize and query data from a file. -Including \li{if} statements inside \li{awk} commands gives us the ability to perform actions on lines that match a given pattern. The following example prints the filenames of all files that are owned by \li{freddy}. +Including \li{if} statements inside \li{awk} commands gives us the ability to perform actions on lines that match a given pattern. +The following example prints the filenames of all files that are owned by \li{freddy}. \begin{lstlisting} $ ls -l | awk ' {if ($3 ~ /freddy/) print $9} ' \end{lstlisting} %$ -Because there is a lot going on in this command, we will break it down piece-by-piece. The output of \li{ls -l} is getting piped to \li{awk}. Then we have an \li{if} statement. The syntax here means if the condition inside the parenthesis holds, print field $9$ (the field with the filename). The condition is where we use regular expressions.
The \li{\~} checks to see if the contents of field $3$ (the field with the username) matches the regular expression found inside the forward slashes. To clarify, \li{freddy} is the regular expression in this example and the expression must be surrounded by forward slashes. +Because there is a lot going on in this command, we will break it down piece-by-piece. +The output of \li{ls -l} is getting piped to \li{awk}. Then we have an \li{if} statement. +The syntax here means if the condition inside the parentheses holds, print field $9$ (the field with the filename). The condition is where we use regular expressions. +The \li{\~} checks to see if the contents of field $3$ (the field with the username) match the regular expression found inside the forward slashes. +To clarify, \li{freddy} is the regular expression in this example and the expression must be surrounded by forward slashes. -Consider a similar example. In this example, we will list the names of the directories inside the current directory. (This replicates the behavior of the Unix command \li{ls -d */}) +Consider a similar example. +In this example, we will list the names of the directories inside the current directory. +(This replicates the behavior of the Unix command \li{ls -d */}) \begin{lstlisting} $ ls -l | awk ' {if ($1 ~ /^d/) print $9} ' @@ -702,23 +504,4 @@ \subsection*{Regular Expressions and awk} Some of the definitions for character classes we used earlier in this lab will not work in the Unix Shell. For example, \li{\\w} and \li{\\d} are not defined. Instead of \li{\\w}, use \li{[[:alnum:]]}. Instead of \li{\\d}, use \li{[[:digit:]]}. For a complete list of similar character classes, search the internet for \emph{POSIX Character Classes} or \emph{Bracket Character Classes.} \end{warn} -\begin{problem} -You have been given a list of transactions from a fictional start-up company. In the \li{transactions.txt} file, each line represents a transaction.
Transactions are represented as follows: -\begin{lstlisting} -# Notice the semicolons delimiting the fields. Also, notice that in between the last and first name, that is a comma, not a semicolon. -;;,; -\end{lstlisting} - -Using this set of transactions, produce the following information using regular expressions and the given command: -\begin{itemize} - \item Using \li{grep}, print all transactions by either Nicholas Ross or Zoey Ross. - \item Using \li{awk}, print a sorted list of the names of individuls that bought item $3298$. - \item Using \li{awk}, print a sorted list of items purchased between June 13 and June 15 of 2014 (inclusive). -\end{itemize} -These queries can be produced using one command each. -\end{problem} - -We encourage the interested reader to research more about how regular expressions can be used with \li{sed}. - % TODO: deterministic finite state automata. -\end{comment} diff --git a/PythonEssentials.tex b/PythonEssentials.tex index 935356d..24f3f04 100644 --- a/PythonEssentials.tex +++ b/PythonEssentials.tex @@ -53,6 +53,7 @@ \part{Labs} \part{Appendices} % Relevant Appendices --------------------------------------- \begin{appendices} +\subimport{./Appendices/Setup/}{SetupStudent} \subimport{./Appendices/Installation/}{Installation} \subimport{./Appendices/NumpyVisualGuide/}{NumpyVisualGuide} % \subimport{./Appendices/MatplotlibCustomization/}{MatplotlibCustomization} % TODO diff --git a/Volume1.tex b/Volume1.tex index 0284a1f..cd286b8 100644 --- a/Volume1.tex +++ b/Volume1.tex @@ -64,6 +64,7 @@ \part{Labs} % Volume I Labs --------------------------------------------------- \part{Appendices} % Relevant Appendices --------------------------------------- \begin{appendices} \subimport{./Appendices/Setup/}{SetupStudent} +\subimport{./Appendices/Installation/}{Installation} \subimport{./Appendices/NumpyVisualGuide/}{NumpyVisualGuide} % \subimport{./Appendices/MatplotlibCustomization/}{MatplotlibCustomization} % TODO % 
\subimport{./Appendices/ScipyStats/}{ScipyStats} % TODO diff --git a/Volume2.tex b/Volume2.tex index 1f7372c..7c6c943 100644 --- a/Volume2.tex +++ b/Volume2.tex @@ -65,6 +65,7 @@ \part{Labs} \part{Appendices} \begin{appendices} \subimport{./Appendices/Setup/}{SetupStudent} +\subimport{./Appendices/Installation/}{Installation} \subimport{./Appendices/NumpyVisualGuide/}{NumpyVisualGuide} % \subimport{./Appendices/MatplotlibCustomization/}{MatplotlibCustomization} % TODO: \subimport{./Appendices/ScipyStats/}{ScipyStats} diff --git a/Volume3.tex b/Volume3.tex index ca4a434..d5ca12b 100644 --- a/Volume3.tex +++ b/Volume3.tex @@ -61,6 +61,7 @@ \part{Labs} % ----------------------------------------------------------------- \begin{appendices} \subimport{./Appendices/Setup/}{SetupStudent} +\subimport{./Appendices/Installation/}{Installation} \subimport{./Appendices/NumpyVisualGuide/}{NumpyVisualGuide} % \subimport{./Appendices/MatplotlibCustomization/}{MatplotlibCustomization} % TODO: \subimport{./Appendices/ScipyStats/}{ScipyStats} diff --git a/Volume4.tex b/Volume4.tex index 48b324b..4cc3a0a 100644 --- a/Volume4.tex +++ b/Volume4.tex @@ -75,6 +75,7 @@ \part{Labs} \begin{appendices} \subimport{./Appendices/Setup/}{SetupStudent} +\subimport{./Appendices/Installation/}{Installation} \subimport{./Appendices/NumpyVisualGuide/}{NumpyVisualGuide} % \subimport{./Appendices/MatplotlibCustomization/}{MatplotlibCustomization} \end{appendices} diff --git a/docs/DataScienceEssentials.pdf b/docs/DataScienceEssentials.pdf index 9d66d60..8981d95 100644 Binary files a/docs/DataScienceEssentials.pdf and b/docs/DataScienceEssentials.pdf differ diff --git a/docs/PythonEssentials.pdf b/docs/PythonEssentials.pdf index bd23d8b..36055f5 100644 Binary files a/docs/PythonEssentials.pdf and b/docs/PythonEssentials.pdf differ diff --git a/docs/Volume1.pdf b/docs/Volume1.pdf index 35a6076..8f99d7d 100644 Binary files a/docs/Volume1.pdf and b/docs/Volume1.pdf differ diff --git a/docs/Volume2.pdf 
b/docs/Volume2.pdf index c210852..2f2418a 100644 Binary files a/docs/Volume2.pdf and b/docs/Volume2.pdf differ diff --git a/docs/Volume3.pdf b/docs/Volume3.pdf index bd9722e..87ff6a4 100644 Binary files a/docs/Volume3.pdf and b/docs/Volume3.pdf differ diff --git a/docs/Volume4.pdf b/docs/Volume4.pdf index e31b64e..15b07fb 100644 Binary files a/docs/Volume4.pdf and b/docs/Volume4.pdf differ