diff --git a/Makefile b/Makefile
index 55b4d89..99facbd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
-SUBDIR+= basics
 SUBDIR+= probability
+SUBDIR+= shannon-entropy
+SUBDIR+= applications
 
 INCLUDE_MAKEFILES=makefiles
 include ${INCLUDE_MAKEFILES}/subdir.mk
diff --git a/applications/.gitignore b/applications/.gitignore
new file mode 100644
index 0000000..116ecc4
--- /dev/null
+++ b/applications/.gitignore
@@ -0,0 +1,6 @@
+applications.log.gnuplot
+applications.log.table
+applications.pdf
+encryptic.png
+password_reuse.png
+password_strength.png
diff --git a/basics/Makefile b/applications/Makefile
similarity index 69%
rename from basics/Makefile
rename to applications/Makefile
index 9b50168..310affa 100644
--- a/basics/Makefile
+++ b/applications/Makefile
@@ -1,15 +1,13 @@
 .PHONY: all
-all: basics.pdf
+all: applications.pdf
 
-LATEXFLAGS+= -shell-escape
-TEX_OUTDIR= .
-
-SRC+= basics.tex abstract.tex basics.bib
+SRC+= applications.tex preamble.tex applications.bib
+SRC+= abstract.tex contents.tex
 SRC+= hhcount.sty
 
 FIGURES+= collusion.png
 FIGURES+= password_strength.png password_reuse.png encryptic.png
 
-basics.pdf: ${SRC} ${FIGURES}
+applications.pdf: ${SRC} ${FIGURES}
 
 password_strength.png:
 	wget -O $@ http://imgs.xkcd.com/comics/password_strength.png
@@ -28,8 +26,8 @@ hhcount.sty hhutils0.sty:
 
 .PHONY: clean
 clean:
-	${RM} basics.pdf
-	${RM} basics.bbl basics.log.gnuplot basics.log.table basics.nav basics.run.xml basics.snm basics.vrb
+	${RM} applications.pdf
+	${RM} applications.bbl applications.log.gnuplot applications.log.table applications.nav applications.run.xml applications.snm applications.vrb
 
 
 .PHONY: distclean
diff --git a/applications/README.md b/applications/README.md
new file mode 100644
index 0000000..2e87c69
--- /dev/null
+++ b/applications/README.md
@@ -0,0 +1,2 @@
+- "Applications of information theory" (MP4 H.264 AAC)
+  \[[HTTPS](https://ver.miun.se/courses/security/dasak/infotheory-applications.mp4)\]
diff --git a/applications/abstract.tex b/applications/abstract.tex
new file mode 100644
index 0000000..6bb8a86
--- /dev/null
+++ b/applications/abstract.tex
@@ -0,0 +1,17 @@
+\emph{Summary:}
+There are various uses for information theory.
+In this session we will explore some of them.
+
+\emph{Intended learning outcomes:}
+After this session you should be able
+\begin{itemize}
+  \item to \emph{understand} how Shannon entropy can be applied in various
+    areas.
+\end{itemize}
+
+\emph{Reading:}
+You should read about the use of entropy to estimate anonymity:
+\citetitle{Eckersley2010hui}~\cite{Eckersley2010hui}.
+This is then utilised in the text \citetitle{Bosk2013gl}~\cite{Bosk2013gl} (in
+Swedish), and in \citetitle{Komanduri2011opa}~\cite{Komanduri2011opa}, which
+treats passwords.
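The anonymity reading above boils down to one formula: an attribute value shared by a fraction p of all observed browsers reveals -log2(p) bits of identifying information. A minimal Python sketch of that computation (an illustration of the measure in Eckersley's paper, not code from this repository):

    import math

    def surprisal_bits(count_with_value: int, total: int) -> float:
        # Self-information -log2(p) of one observed attribute value.
        return -math.log2(count_with_value / total)

    # A fingerprint that is unique among 2,860,696 tested browsers (the
    # sample size quoted in the applications slides) carries ~21.4 bits.
    print(surprisal_bits(1, 2_860_696))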
diff --git a/basics/basics.bib b/applications/applications.bib
similarity index 100%
rename from basics/basics.bib
rename to applications/applications.bib
diff --git a/applications/applications.tex b/applications/applications.tex
new file mode 100644
index 0000000..90dd7d9
--- /dev/null
+++ b/applications/applications.tex
@@ -0,0 +1,80 @@
+%\documentclass[handout]{beamer}
+\documentclass{beamer}
+\input{preamble.tex}
+
+\usepackage{xparse}
+\ProvideDocumentEnvironment{exercise}{o}{%
+  \setbeamercolor{block body}{bg=yellow!30,fg=black}
+  \setbeamercolor{block title}{bg=yellow,fg=black}
+  \IfValueTF{#1}{%
+    \begin{block}{Exercise: #1}
+  }{%
+    \begin{block}{Exercise}
+  }
+}{%
+  \end{block}
+}
+\ProvideDocumentEnvironment{remark}{o}{%
+  \IfValueTF{#1}{%
+    \begin{alertblock}{Remark: #1}
+  }{%
+    \begin{alertblock}{Remark}
+  }
+}{%
+  \end{alertblock}
+}
+\DeclareMathOperator{\powerset}{\mathcal{P}}
+\DeclareMathOperator{\p}{\mathcal{P}}
+\let\P\p
+\DeclareMathOperator{\C}{\mathcal{C}}
+\DeclareMathOperator{\K}{\mathcal{K}}
+\DeclareMathOperator{\E}{\mathcal{E}}
+\DeclareMathOperator{\D}{\mathcal{D}}
+
+\DeclareMathOperator{\N}{\mathbb{N}}
+\DeclareMathOperator{\Z}{\mathbb{Z}}
+\DeclareMathOperator{\R}{\mathbb{R}}
+
+\let\stoch\mathbf{}
+
+\DeclareMathOperator{\xor}{\oplus}
+
+\renewcommand{\qedsymbol}{Q.E.D.}
+
+\mode{%
+  \usetheme{Berlin}
+  \setbeamercovered{transparent}
+}
+\setbeamertemplate{footline}{\insertframenumber}
+
+\title{%
+  Applications of information theory
+}
+\author{%
+  Daniel Bosk
+}
+\institute[MIUN IKS]{%
+  Department of Information and Communication Systems,\\
+  Mid Sweden University, Sundsvall.
+}
+\date{\today}
+
+\AtBeginSection[]{%
+  \begin{frame}
+    \tableofcontents[currentsection]
+  \end{frame}
+}
+
+\begin{document}
+
+\begin{frame}
+  \titlepage{}
+\end{frame}
+
+\begin{frame}
+  \tableofcontents
+\end{frame}
+
+\mode{\input{contents.tex}}
+
+\end{document}
diff --git a/basics/collusion.png b/applications/collusion.png
similarity index 100%
rename from basics/collusion.png
rename to applications/collusion.png
diff --git a/applications/contents.tex b/applications/contents.tex
new file mode 100644
index 0000000..c338ddf
--- /dev/null
+++ b/applications/contents.tex
@@ -0,0 +1,361 @@
+% Since this is a solution template for a generic talk, very little can
+% be said about how it should be structured. However, the talk length
+% of between 15min and 45min and the theme suggest that you stick to
+% the following rules:
+
+% - Exactly two or three sections (other than the summary).
+% - At *most* three subsections per section.
+% - Talk about 30s to 2min per frame. So there should be between about
+%   15 and 30 frames, all told.
+
+
+\section{Applications}
+
+\subsection{Information density and redundancy}
+
+\begin{frame}
+  \begin{definition}
+    \begin{itemize}
+      \item Natural language \(L\).
+      \item Stochastic variable \(\stoch P^n_L\) of strings of length \(n\).
+      \item (Alphabet \(P_L\).)
+      \item Entropy of \(L\) is defined as
+        \begin{align*}
+          H_L = \lim_{n\to \infty}\frac{H(\stoch P^n_L)}{n}.
+        \end{align*}
+      \item Redundancy in \(L\) is
+        \begin{align*}
+          R_L = 1 - \frac{H_L}{\log |P_L|}.
+        \end{align*}
+    \end{itemize}
+  \end{definition}
+\end{frame}
+
+\begin{frame}
+  \begin{remark}
+    This means we have \(H_L\) bits per character in \(L\).
+  \end{remark}
+
+  \begin{example}[\cite{Shannon1948amt}]
+    \begin{itemize}
+      \item Entropy of 1--1.5 bits per character in English.
+      \item Redundancy of approximately \(1 - \frac{1.25}{\log 26} \approx
+        0.73\).
+    \end{itemize}
+  \end{example}
+
+\end{frame}
+
+\begin{frame}
+  \begin{example}[\cite{Shannon1948amt}]
+    Two-dimensional crossword puzzles require a redundancy of approximately
+    \(0.5\).
+  \end{example}
+
+  \begin{example}
+    \begin{itemize}
+      \item Redundancy of \enquote{SMS languages} is lower than for
+        \enquote{non-SMS languages}.
+
+      \item Compare \enquote{också} and \enquote{oxå}.
+
+    \end{itemize}
+  \end{example}
+
+  \begin{remark}
+    \begin{itemize}
+      \item Lower redundancy is more space-efficient.
+      \item But it incurs more errors.
+    \end{itemize}
+  \end{remark}
+\end{frame}
+
+%\begin{frame}
+%  \begin{itemize}
+%    \item This also means that we can estimate the entropy of a given
+%      Markov process.
+%
+%    \item Shannon modelled language as a Markov process in his paper
+%      \cite{Shannon1948amt}.
+%
+%    \item We can also compute the entropy of a given state in a Markov
+%      process by means of conditional entropy.
+%
+%  \end{itemize}
+%\end{frame}
+
+\subsection{Passwords}
+
+\begin{frame}
+  \begin{block}{Idea~\cite{Komanduri2011opa}}
+    \begin{itemize}
+      \item Look at different aspects of passwords individually, then
+        summarize.
+      \item Can use \(H(x_1, x_2, \ldots, x_n) \leq H(x_1) + H(x_2) + \cdots
+        + H(x_n)\).
+      \item This allows us to reason about bounds.
+    \end{itemize}
+  \end{block}
+\end{frame}
+
+\begin{frame}
+  \begin{example}
+    \begin{itemize}
+      \item We can look at properties such as:
+        \begin{itemize}
+          \item length,
+          \item number and placement of character classes,
+          \item the actual characters,
+          \item \dots
+        \end{itemize}
+    \end{itemize}
+  \end{example}
+
+  \pause{}
+
+  \begin{remark}
+    \begin{itemize}
+      \item These are \emph{not independent}.
+      \item The sum will be an \emph{upper bound}.
+    \end{itemize}
+  \end{remark}
+\end{frame}
+
+\begin{frame}
+  \begin{remark}
+    \begin{itemize}
+      \item With an upper bound we know it's not possible to do better.
+      \item With an average we know how well most users will do.
+      \item With a lower bound we would have a guarantee --- but such
+        a guarantee is not possible!
+    \end{itemize}
+  \end{remark}
+\end{frame}
+
+\begin{frame}
+  \begin{remark}
+    \begin{itemize}
+      \item If a password policy yields low entropy, this implies that the
+        policy is bad.
+      \item If a password policy yields high entropy, this \emph{doesn't}
+        imply that the policy is good.
+    \end{itemize}
+  \end{remark}
+
+  \pause
+
+  \begin{exercise}
+    Why?
+  \end{exercise}
+\end{frame}
+
+\begin{frame}
+  \begin{figure}
+    \includegraphics[height=0.7\textheight]{password_strength.png}
+    \caption{xkcd's strip on password strength.
+      Picture: xkcd~\cite{xkcd936}.}
+  \end{figure}
+\end{frame}
+
+%\begin{frame}{An explanation of xkcd}
+%  \begin{itemize}
+%    \item We have 1 million English words: gives \(\log 10^6 \approx 20\)
+%      bits of entropy.
+%      (xkcd uses 16 bits, which gives roughly 70\,000 words; not all English
+%      words are common.)
+%
+%    \item We may have an initial capital letter: gives 1 bit of entropy.
+%
+%    \item We have some common substitutions: an estimated 10 of them,
+%      i.e.~3 bits of entropy.
+%
+%    \item We have special characters (not substitutions): an estimated
+%      4 bits of entropy.
+%
+%    \item We have digits: \(\log 10\approx 3\).
+%
+%    \item The order of the special character and the digit: gives 1 bit of
+%      entropy.
+%
+%    \item In total 32 bits of entropy:
+%      \begin{itemize}
+%        \item Takes at least 50 days at 1\,000 guesses per second.
+%        \item Takes just over an hour at 1\,000\,000 guesses per second.
+%      \end{itemize}
+%
+%  \end{itemize}
+%\end{frame}
+
+\begin{frame}
+  \begin{example}[Standard password]
+    \begin{itemize}
+      \item We have
+        \begin{itemize}
+          \item 26 alphabetic characters,
+          \item 10 digits,
+          \item 10 special characters (approximately).
+        \end{itemize}
+
+      \item This yields \(\log( 2\times 26 + 10 + 10 ) = \log 72 \approx
+        \SI{6}{\bit}\) per password character.
+
+      \item A 10-character \emph{uniformly randomly} generated password
+        contains \SI{60}{\bit}.
+    \end{itemize}
+  \end{example}
+
+  \pause{}
+
+  \begin{remark}
+    What happens when we require that two upper-case characters, two
+    lower-case characters and two digits must be included?
+  \end{remark}
+\end{frame}
+
+\begin{frame}
+  \begin{example}[Four-word passphrase]
+    \begin{itemize}
+      \item We have 125\,000 words in the standard Swedish dictionary.
+      \item This yields \(\log 125\,000\approx \SI{17}{\bit}\) per word.
+      \item A four-word \emph{uniformly randomly} generated passphrase contains
+        \SI{68}{\bit}.
+    \end{itemize}
+  \end{example}
+\end{frame}
+
+\begin{frame}
+  \begin{example}[Random sentence]
+    \begin{itemize}
+      \item We estimated the entropy per character in a language.
+      \item It was approximately \(\SI{1.25}{\bit}\) for English.
+      \item A 20-character \emph{uniformly randomly} generated sentence would
+        yield \SI{25}{\bit}.
+    \end{itemize}
+  \end{example}
+\end{frame}
+
+\begin{frame}
+  \begin{remark}
+    \begin{itemize}
+      \item All these require uniform randomness.
+      \item Humans are bad at remembering random things.
+      \item Thus they will choose non-randomly.
+      \item The entropy will thus be (possibly much) lower.
+    \end{itemize}
+  \end{remark}
+\end{frame}
+
+\subsection{Research about human-chosen passwords}
+
+\begin{frame}
+  \begin{example}[\citetitle{Bonneau2012lpo}~\cite{Bonneau2012lpo}]
+    \begin{itemize}
+      \item Investigates how linguistics affect the choice of multi-word
+        passphrases.
+
+      \item Users don't choose them randomly; they prefer phrases adapted to
+        natural language.
+
+      \item \enquote{correct horse battery staple} is preferred to
+        \enquote{horse correct battery staple} since the first is more
+        grammatically correct.
+    \end{itemize}
+  \end{example}
+\end{frame}
+
+\begin{frame}
+  \begin{example}[\citetitle{Kuo2006hso}~\cite{Kuo2006hso}]
+    \begin{itemize}
+      \item Studied how users create easy-to-remember passwords.
+
+      \item Also investigated the strength of phrase-based passwords.
+
+      \item E.g.\ Google's example \enquote{To be or not to be, that is the
+        question}\footnote{%
+          URL\@:
+          \protect\url{http://www.lightbluetouchpaper.org/2011/11/08/want-to-create-a-really-strong-password-dont-ask-google/}.
+        } which results in \enquote{2bon2btitq}.
+
+      \item This particular password has apparently been used by many \dots
+    \end{itemize}
+  \end{example}
+\end{frame}
+
+\begin{frame}
+  \begin{remark}
+    \begin{itemize}
+      \item There is a PhD thesis on the topic of guessing passwords:
+        \fullcite{GuessingHumanChosenSecrets}.
+      \item There is even a conference dedicated to passwords: PasswordsCon.
+    \end{itemize}
+  \end{remark}
+\end{frame}
+
+\subsection{Identifying information}
+
+\begin{frame}
+  \begin{example}
+    Do we get more information from zodiac signs or birthdays?
+    \begin{align*}
+      -\sum_{\mathclap{\text{zodiacs}}} \frac{1}{12} \log\frac{1}{12} &= \log 12
+        \approx 3.58 \\
+      &< -\sum_{\mathclap{\text{days of year}}} \frac{1}{365} \log\frac{1}{365}
+        = \log 365 \approx 8.51.
+    \end{align*}
+  \end{example}
+\end{frame}
+
+\begin{frame}
+  \begin{exercise}
+    How much information do we need to uniquely identify an individual?
+  \end{exercise}
+\end{frame}
+
+\begin{frame}
+  \begin{example}
+    \begin{itemize}
+      \item Sometime during 2011 there were \(n = 6\,973\,738\,433\)\footnote{%
+          According to the World Bank.
+        } people on earth.
+
+      \item To give everyone a unique identifier we need \(\log n\approx
+        32.7\approx 33\) bits of information.
+    \end{itemize}
+  \end{example}
+\end{frame}
+
+\begin{frame}
+  \begin{block}{Identifying information in browsers}
+    \begin{itemize}
+      \item The Electronic Frontier Foundation (EFF)
+        studied~\cite{Eckersley2010hui} how much information a web browser
+        shares.
+
+      \item You can test your own browser at \url{http://panopticlick.eff.org/}.
+    \end{itemize}
+  \end{block}
+
+  \pause{}
+
+  \begin{example}[My browser]
+    \begin{itemize}
+      \item My Firefox browser with all add-ons gave 21.45 bits of entropy.
+
+      \item At that time, the number of tested users was 2\,860\,696.
+    \end{itemize}
+  \end{example}
+\end{frame}
+
+\begin{frame}
+  \begin{figure}
+    \includegraphics[height=0.7\textheight]{collusion.png}
+    \caption{Screenshot from Collusion (now Lightbeam) for Firefox.
+      A map of all pages that track me using this information.}
+  \end{figure}
+\end{frame}
+
+
+%%%%%%%%%%%%%%%%%%%%%%
+
+\subsection*{References}
+\begin{frame}[allowframebreaks]
+  \small
+  \printbibliography{}
+\end{frame}
+
diff --git a/applications/preamble.tex b/applications/preamble.tex
new file mode 100644
index 0000000..dec504b
--- /dev/null
+++ b/applications/preamble.tex
@@ -0,0 +1,23 @@
+\usepackage[utf8]{inputenc}
+\usepackage[T1]{fontenc}
+\usepackage[swedish,british]{babel}
+\usepackage{url}
+\usepackage{graphicx}
+\usepackage{color}
+\usepackage{subfig}
+\usepackage{multicol}
+\usepackage{amssymb,amsmath,amsthm}
+\usepackage{booktabs}
+%\usepackage[squaren,binary]{SIunits}
+\usepackage[binary-units]{siunitx}
+\usepackage[strict]{csquotes}
+\usepackage{cleveref}
+\usepackage{hhcount}
+\usepackage{pgfplots}
+
+\usepackage{mathtools}
+
+\setbeamertemplate{bibliography item}[text]
+\usepackage[natbib,style=alphabetic,maxbibnames=99]{biblatex}
+\addbibresource{applications.bib}
+
diff --git a/basics/.gitignore b/basics/.gitignore
deleted file mode 100644
index f8a4f0a..0000000
--- a/basics/.gitignore
+++ /dev/null
@@ -1,6 +0,0 @@
-basics.log.gnuplot
-basics.log.table
-basics.pdf
-encryptic.png
-password_reuse.png
-password_strength.png
diff --git a/basics/abstract.tex b/basics/abstract.tex
deleted file mode 100644
index 2513055..0000000
--- a/basics/abstract.tex
+++ /dev/null
@@ -1,13 +0,0 @@
-The area of Information Theory was founded in 1948 by Claude Shannon.
-It concerns information, e.g.\ how much information is contained in certain
-data.
-Equivalently, it is also a measure of uncertainty in information, and has thus
-plenty of application in security and cryptography.
-
-The concept of entropy, the main part of information theory, is treated in
-a few short texts: \citetitle{Eckersley2010apo}~\cite{Eckersley2010apo} and
-applied in \citetitle{Eckersley2010hui}~\cite{Eckersley2010hui}, but also in
-\citetitle{Ueltschi2013se}~\cite{Ueltschi2013se}.
-This is then utilised in the text \citetitle{Bosk2013gl}~\cite{Bosk2013gl} (in
-Swedish), and \citetitle{Komanduri2011opa}~\cite{Komanduri2011opa} which treats
-passwords.
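All entropy figures in the password and identification frames above are log2 of a count of equally likely alternatives. A quick Python sanity check of the slides' arithmetic (my sketch, not part of the build):

    from math import log2

    print(10 * log2(2 * 26 + 10 + 10))  # 10-char password: ~61.7 bits (slides round to 60)
    print(4 * log2(125_000))            # four-word passphrase: ~67.7, i.e. ~68 bits
    print(20 * 1.25)                    # 20-char English sentence: 25 bits
    print(log2(12), log2(365))          # zodiac ~3.58 bits vs birthday ~8.51 bits
    print(log2(6_973_738_433))          # 2011 world population: ~32.7, i.e. ~33 bits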
diff --git a/shannon-entropy/.gitignore b/shannon-entropy/.gitignore
new file mode 100644
index 0000000..e27d6d8
--- /dev/null
+++ b/shannon-entropy/.gitignore
@@ -0,0 +1,6 @@
+shannon-entropy.log.gnuplot
+shannon-entropy.log.table
+shannon-entropy.pdf
+encryptic.png
+password_reuse.png
+password_strength.png
diff --git a/shannon-entropy/Makefile b/shannon-entropy/Makefile
new file mode 100644
index 0000000..0754a58
--- /dev/null
+++ b/shannon-entropy/Makefile
@@ -0,0 +1,34 @@
+.PHONY: all
+all: shannon-entropy.pdf
+
+LATEXFLAGS+= -shell-escape
+TEX_OUTDIR= .
+
+SRC+= shannon-entropy.tex preamble.tex shannon-entropy.bib
+SRC+= abstract.tex contents.tex
+SRC+= hhcount.sty
+
+shannon-entropy.pdf: ${SRC}
+
+hhcount.sty: hhutils0.sty
+hhcount.sty hhutils0.sty:
+	wget -O $@ http://tug.ctan.org/tex-archive/usergrps/uktug/baskervi/5_5/$@
+	touch $@
+
+
+.PHONY: clean
+clean:
+	${RM} shannon-entropy.pdf
+	${RM} shannon-entropy.bbl shannon-entropy.log.gnuplot shannon-entropy.log.table shannon-entropy.nav shannon-entropy.run.xml shannon-entropy.snm shannon-entropy.vrb
+
+
+.PHONY: distclean
+distclean:
+	${RM} password_strength.png
+	${RM} password_reuse.png
+	${RM} encryptic.png
+	${RM} hhcount.sty hhutils0.sty
+
+
+INCLUDE_MAKEFILES=../makefiles
+include ${INCLUDE_MAKEFILES}/tex.mk
diff --git a/shannon-entropy/README.md b/shannon-entropy/README.md
new file mode 100644
index 0000000..c8da595
--- /dev/null
+++ b/shannon-entropy/README.md
@@ -0,0 +1,2 @@
+- "Shannon entropy" (MP4 H.264 AAC)
+  \[[HTTPS](https://ver.miun.se/courses/security/dasak/shannon-entropy.mp4)\]
diff --git a/shannon-entropy/abstract.tex b/shannon-entropy/abstract.tex
new file mode 100644
index 0000000..e3a4d1b
--- /dev/null
+++ b/shannon-entropy/abstract.tex
@@ -0,0 +1,19 @@
+\emph{Summary:}
+The area of Information Theory was founded in 1948 by Claude Shannon.
+It is a mathematical theory for reasoning about how much information is
+contained in certain data.
+Equivalently, it is also a measure of uncertainty in information, and thus has
+plenty of applications in security and cryptography.
+This learning session covers the basic concept: Shannon entropy.
+
+\emph{Intended learning outcomes:}
+After the session you should be able
+\begin{itemize}
+  \item to \emph{apply} Shannon entropy in basic situations.
+\end{itemize}
+
+\emph{Reading:}
+The concept of Shannon entropy, the main part of information theory, is treated
+in a few short texts:
+\citetitle{Eckersley2010apo}~\cite{Eckersley2010apo} and
+\citetitle{Ueltschi2013se}~\cite{Ueltschi2013se}.
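The learning outcome above, applying Shannon entropy in basic situations, rests on the definition H(X) = -sum_i p_i log2 p_i. A self-contained Python sketch of that definition (an illustration only; the deck derives and motivates it properly):

    import math

    def shannon_entropy(probs):
        # H(X) = -sum p_i * log2(p_i); outcomes with p_i = 0 contribute nothing.
        return -sum(p * math.log2(p) for p in probs if p > 0)

    print(shannon_entropy([0.5, 0.5]))   # fair coin: 1.0 bit
    print(shannon_entropy([0.9, 0.1]))   # biased coin: ~0.47 bits
    print(shannon_entropy([1 / 6] * 6))  # fair die: ~2.58 bits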
diff --git a/shannon-entropy/collusion.png b/shannon-entropy/collusion.png new file mode 100644 index 0000000..e85f526 Binary files /dev/null and b/shannon-entropy/collusion.png differ diff --git a/basics/basics.tex b/shannon-entropy/contents.tex similarity index 55% rename from basics/basics.tex rename to shannon-entropy/contents.tex index c49c5fb..3eadb8e 100644 --- a/basics/basics.tex +++ b/shannon-entropy/contents.tex @@ -1,102 +1,3 @@ -%\documentclass[handout]{beamer} -\documentclass{beamer} -\usepackage[utf8]{inputenc} -\usepackage[T1]{fontenc} -\usepackage[swedish,british]{babel} -\usepackage{url} -\usepackage{graphicx} -\usepackage{color} -\usepackage{subfig} -\usepackage{multicol} -\usepackage{amssymb,amsmath,amsthm} -\usepackage{booktabs} -%\usepackage[squaren,binary]{SIunits} -\usepackage[binary-units]{siunitx} -\usepackage[strict]{csquotes} -\usepackage{cleveref} -\usepackage{hhcount} -\usepackage{pgfplots} - -\usepackage{mathtools} - -\setbeamertemplate{bibliography item}[text] -\usepackage[natbib,style=alphabetic,maxbibnames=99]{biblatex} -\addbibresource{basics.bib} - -\usepackage{xparse} -\ProvideDocumentEnvironment{exercise}{o}{% - \setbeamercolor{block body}{bg=yellow!30,fg=black} - \setbeamercolor{block title}{bg=yellow,fg=black} - \IfValueTF{#1}{% - \begin{block}{Exercise: #1} - }{% - \begin{block}{Exercise} - } -}{% - \end{block} -} -\ProvideDocumentEnvironment{remark}{o}{% - \IfValueTF{#1}{% - \begin{alertblock}{Remark: #1} - }{% - \begin{alertblock}{Remark} - } -}{% - \end{alertblock} -} -\DeclareMathOperator{\powerset}{\mathcal{P}} -\DeclareMathOperator{\p}{\mathcal{P}} -\let\P\p -\DeclareMathOperator{\C}{\mathcal{C}} -\DeclareMathOperator{\K}{\mathcal{K}} -\DeclareMathOperator{\E}{\mathcal{E}} -\DeclareMathOperator{\D}{\mathcal{D}} - -\DeclareMathOperator{\N}{\mathbb{N}} -\DeclareMathOperator{\Z}{\mathbb{Z}} -\DeclareMathOperator{\R}{\mathbb{R}} - -\let\stoch\mathbf{} - -\DeclareMathOperator{\xor}{\oplus} - -\renewcommand{\qedsymbol}{Q.E.D.} - -\mode{% - \usetheme{Berlin} - \setbeamercovered{transparent} -} -\setbeamertemplate{footline}{\insertframenumber} - -\title{% - Applied Information Theory -} -\author{% - Daniel Bosk -} -\institute[MIUN IKS]{% - Department of Information and Communication Systems,\\ - Mid Sweden University, Sundsvall. -} -\date{\today} - -\AtBeginSection[]{% - \begin{frame} - \tableofcontents[currentsection] - \end{frame} -} - -\begin{document} - -\begin{frame} - \titlepage{} -\end{frame} - -\begin{frame} - \tableofcontents -\end{frame} - - % Since this a solution template for a generic talk, very little can % be said about how it should be structured. However, the talk length % of between 15min and 45min and the theme suggest that you stick to @@ -108,7 +9,7 @@ % 15 and 30 frames, all told. -\section{Introduction} +\section{Shannon entropy} \subsection{History} @@ -154,8 +55,6 @@ \subsection{History} \end{frame} -\section{Shannon entropy} - \subsection{Definition of Shannon Entropy} \begin{frame} @@ -545,79 +444,6 @@ \subsection{Conditional entropy} % \end{theorem} %\end{frame} -\subsection{Information density and redundancy} - -\begin{frame} - \begin{definition} - \begin{itemize} - \item Natural language \(L\). - \item Stochastic variable \(\stoch P^n_L\) of strings of length \(n\). - \item (Alphabet \(P_L\).) - \item Entropy of \(L\) defined as - \begin{align*} - H_L = \lim_{n\to \infty}\frac{H(\stoch P^n_L)}{n}. - \end{align*} - \item Redundancy in \(L\) is - \begin{align*} - R_L = 1 - \frac{H_L}{\log |P_L|}. 
- \end{align*} - \end{itemize} - \end{definition} -\end{frame} - -\begin{frame} - \begin{remark} - Meaning we have \(H_L\) bits per character in \(L\). - \end{remark} - - \begin{example}[\cite{Shannon1948amt}] - \begin{itemize} - \item Entropy of 1--1.5 bits per character in English. - \item Redundancy of approximately \(1 - \frac{1.25}{\log 26} \approx - 0.73\). - \end{itemize} - \end{example} - -\end{frame} - -\begin{frame} - \begin{example}[\cite{Shannon1948amt}] - Two-dimensional cross-word puzzles requires redundancy of approximately - \(0.5\). - \end{example} - - \begin{example} - \begin{itemize} - \item Redundancy of \enquote{SMS languages} is lower than for - \enquote{non-SMS languages}. - - \item Compare \enquote{också} and \enquote{oxå}. - - \end{itemize} - \end{example} - - \begin{remark} - \begin{itemize} - \item Lower redundancy is more space-efficient. - \item Incurs more errors. - \end{itemize} - \end{remark} -\end{frame} - -%\begin{frame} -% \begin{itemize} -% \item Detta säger också att vi kan uppskatta entropin för en given -% Markovprocess. -% -% \item Shannon modellerade språket som en Markovprocess i sin artikel -% \cite{Shannon1948amt}. -% -% \item Vi kan även beräkna entropin för ett givet tillstånd i en -% Markovprocess genom betingad entropi. -% -% \end{itemize} -%\end{frame} - \subsection{Information gain} \begin{frame} @@ -675,275 +501,6 @@ \subsection{Information gain} \end{frame} -\section[Applications]{Application in security} - -\subsection{Passwords} - -\begin{frame} - \begin{block}{Idea~\cite{Komanduri2011opa}} - \begin{itemize} - \item Look at different aspects of passwords individually, then - summarize. - \item Can use \(H(x_1, x_2, \ldots, x_n) \leq H(x_1) + H(x_2) + \cdots - + H(x_n)\). - \item This allows us to reason about bounds. - \end{itemize} - \end{block} -\end{frame} - -\begin{frame} - \begin{example} - \begin{itemize} - \item We can look at properties such as: - \begin{itemize} - \item length, - \item number of and placement of character classes, - \item the actual characters, - \item \dots - \end{itemize} - \end{itemize} - \end{example} - - \pause{} - - \begin{remark} - \begin{itemize} - \item These are \emph{not independent}. - \item The sum will be an \emph{upper bound}. - \end{itemize} - \end{remark} -\end{frame} - -\begin{frame} - \begin{remark} - \begin{itemize} - \item With an upper bound we know it's not possible to do better. - \item With an average we know how well most users will do. - \item With a lower bound we have a guarantee --- not possible! - \end{itemize} - \end{remark} -\end{frame} - -\begin{frame} - \begin{remark} - \begin{itemize} - \item If a password policy yields low entropy, it implies it's bad. - \item If a password policy yields high entropy, it \emph{doesn't} imply - that it's good. - \end{itemize} - \end{remark} - - \pause - - \begin{exercise} - Why? - \end{exercise} -\end{frame} - -\begin{frame} - \begin{figure} - \includegraphics[height=0.7\textheight]{password_strength.png} - \caption{xkcd's strip on password strength. - Picture: xkcd~\cite{xkcd936}.} - \end{figure} -\end{frame} - -%\begin{frame}{En förklaring av xkcd} -% \begin{itemize} -% \item Vi har 1 miljon engelska ord: ger \(\log 10^6 \approx 20\) bitar -% entropi. -% (xkcd använder 16 bitar, vilket ger ca 70\,000 ord, alla ord i engelskan -% är inte vanliga.) -% -% \item Vi kan ha inledande versal: ger 1 bit entropi. -% -% \item Vi har några vanliga substitutioner: uppskattningsvis 10 stycken, -% d.v.s.~3 bitar entropi. 
-% -% \item Vi har specialtecken (ej substitution): uppskattningsvis 4 bitar -% entropi. -% -% \item Vi har siffror: \(\log 10\approx 3\). -% -% \item Ordningen på specialtecknet och siffran: ger 1 bit entropi. -% -% \item Totalt 32 bitar entropi: -% \begin{itemize} -% \item Tar minst 50 dagar med 1\,000 gissningar per sekund. -% \item Tar strax över en timme med 1\,000\,000 gissningar per sekund. -% \end{itemize} -% -% \end{itemize} -%\end{frame} - -\begin{frame} - \begin{example}[Standard password] - \begin{itemize} - \item We have - \begin{itemize} - \item 26 alphabetic characters, - \item 10 numbers, - \item 10 special characters (approximately). - \end{itemize} - - \item This yields \(\log( 2\times 26 + 10 + 10 ) = \log 72 \approx - \SI{6}{\bit}\) per password character. - - \item A 10-character \emph{uniformly randomly} generated password - contains \SI{60}{\bit}. - \end{itemize} - \end{example} - - \pause{} - - \begin{remark} - What happens when we require two upper and two lower-case characters, two - numbers must be included? - \end{remark} -\end{frame} - -\begin{frame} - \begin{example}[Four-word passphrase] - \begin{itemize} - \item We have 125\,000 words in the standard Swedish dictionary. - \item This yields \(\log 125\,000\approx \SI{17}{\bit}\) per word. - \item A four-word \emph{uniformly randomly} generated passphrase contains - \SI{68}{\bit}. - \end{itemize} - \end{example} -\end{frame} - -\begin{frame} - \begin{example}[Random sentence] - \begin{itemize} - \item We estimated the entropy per character in a language. - \item It was approximately \(\SI{1.25}{\bit}\) for English. - \item A 20-character \emph{uniformly randomly} generated sentence would - yield \SI{25}{\bit}. - \end{itemize} - \end{example} -\end{frame} - -\begin{frame} - \begin{remark} - \begin{itemize} - \item All these require uniform randomness. - \item Humans are bad at remembering random things. - \item Thus they will choose non-randomly. - \item The entropy will thus be (possibly much) lower. - \end{itemize} - \end{remark} -\end{frame} - -\subsection{Research about human chosen passwords} - -\begin{frame} - \begin{example}[\citetitle{Bonneau2012lpo}~\cite{Bonneau2012lpo}] - \begin{itemize} - \item Investigates how linguistics affect the choice of multi-word - passphrases. - - \item Users don't choose them randomly, prefer adapted to natural - language. - - \item \enquote{correct horse battery staple} is preferred to - \enquote{horse correct battery staple} since the first is more - grammatically correct. - \end{itemize} - \end{example} -\end{frame} - -\begin{frame} - \begin{example}[\citetitle{Kuo2006hso}~\cite{Kuo2006hso}] - \begin{itemize} - \item Studied how users creates easy-to-remember passwords. - - \item Also investigated the strength of phrase-based passwords. - - \item E.g.\ Google's example \enquote{To be or not to be, that is the - question}\footnote{% - URL\@: - \protect\url{http://www.lightbluetouchpaper.org/2011/11/08/want-to-create-a-really-strong-password-dont-ask-google/}. - } which results in \enquote{2bon2btitq}. - - \item This particular password has apparently been used by many \dots - \end{itemize} - \end{example} -\end{frame} - -\begin{frame} - \begin{remark} - \begin{itemize} - \item There is a PhD thesis on the topic of guessing passwords: - \fullcite{GuessingHumanChosenSecrets}. - \item There is even a conference dedicated to passwords: PasswordsCon. 
- \end{itemize} - \end{remark} -\end{frame} - -\subsection{Identifying information} - -\begin{frame} - \begin{example} - Do we get more information from zodiac signs or birthdays? - \begin{align*} - -\sum_{\mathclap{\text{zodiacs}}} \frac{1}{12} \log\frac{1}{12} &= \log 12 - \approx 3.58 \\ - &< -\sum_{\mathclap{\text{days of year}}} \frac{1}{365} \log\frac{1}{365} - = \log 365 \approx 8.51. - \end{align*} - \end{example} -\end{frame} - -\begin{frame} - \begin{exercise} - How much information do we need to uniquely identify an individual? - \end{exercise} -\end{frame} - -\begin{frame} - \begin{example} - \begin{itemize} - \item Sometime during 2011 there were \(n = 6\,973\,738\,433\)\footnote{% - According to the World Bank. - } people on earth. - - \item To give everyone a unique identifier we need \(\log n\approx - 32.7\approx 33\) bits of information. - \end{itemize} - \end{example} -\end{frame} - -\begin{frame} - \begin{block}{Identifying information in browsers} - \begin{itemize} - \item Electronic Frontier Foundation (EFF) studied~\cite{Eckersley2010hui} - how much information a web-browser shares. - - \item You can try your browser in \url{http://panopticlick.eff.org/}. - \end{itemize} - \end{block} - - \pause{} - - \begin{example}[My browser] - \begin{itemize} - \item My Firefox-browser with all addons gave 21.45 bits of entropy. - - \item Then the number of tested users were 2\,860\,696. - \end{itemize} - \end{example} -\end{frame} - -\begin{frame} - \begin{figure} - \includegraphics[height=0.7\textheight]{collusion.png} - \caption{Screenshot from Collusion (now Lightbeam) for Firefox. - Map over all pages that track me using this information.} - \end{figure} -\end{frame} - - %%%%%%%%%%%%%%%%%%%%%% \subsection*{References} @@ -952,4 +509,3 @@ \subsection*{References} \printbibliography{} \end{frame} -\end{document} diff --git a/shannon-entropy/preamble.tex b/shannon-entropy/preamble.tex new file mode 100644 index 0000000..ac9c38e --- /dev/null +++ b/shannon-entropy/preamble.tex @@ -0,0 +1,23 @@ +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage[swedish,british]{babel} +\usepackage{url} +\usepackage{graphicx} +\usepackage{color} +\usepackage{subfig} +\usepackage{multicol} +\usepackage{amssymb,amsmath,amsthm} +\usepackage{booktabs} +%\usepackage[squaren,binary]{SIunits} +\usepackage[binary-units]{siunitx} +\usepackage[strict]{csquotes} +\usepackage{cleveref} +\usepackage{hhcount} +\usepackage{pgfplots} + +\usepackage{mathtools} + +\setbeamertemplate{bibliography item}[text] +\usepackage[natbib,style=alphabetic,maxbibnames=99]{biblatex} +\addbibresource{shannon-entropy.bib} + diff --git a/shannon-entropy/shannon-entropy.bib b/shannon-entropy/shannon-entropy.bib new file mode 100644 index 0000000..f1efdaf --- /dev/null +++ b/shannon-entropy/shannon-entropy.bib @@ -0,0 +1,123 @@ +@misc{Eckersley2010apo, + author={Eckersley, Peter}, + title={A Primer on Information Theory and Privacy}, + year={2010}, + month={1}, + URL={https://www.eff.org/deeplinks/2010/01/primer-information-theory-and-privacy}, +} +@inproceedings{Eckersley2010hui, + author={Eckersley, Peter}, + title={How Unique Is Your Browser?}, + booktitle={Privacy Enhancing Technologies}, + pages={1--18}, + year={2010}, + organization={Springer}, + URL={https://panopticlick.eff.org/static/browser-uniqueness.pdf}, +} + +@article{Shannon1948amt, + author={Shannon, C. 
E.},
+  title={A Mathematical Theory of Communication},
+  journal={The Bell System Technical Journal},
+  volume={27},
+  pages={379--423, 623--656},
+  year={1948},
+}
+@unpublished{Ueltschi2013se,
+  author={Ueltschi, Daniel},
+  title={Chapter 6: Shannon entropy},
+  URL={http://www.ueltschi.org/teaching/chapShannon.pdf},
+}
+
+@unpublished{Bosk2013gl,
+  author={Bosk, Daniel},
+  title={Grundl{\"a}ggande l{\"o}senordsanalys},
+  year={2013},
+  URL={http://ver.miun.se/courses/security/compendii/pwdanalysis.pdf},
+}
+
+@inproceedings{Komanduri2011opa,
+  author={Komanduri, Saranga and
+    Shay, Richard and
+    Kelley, Patrick Gage and
+    Mazurek, Michelle L. and
+    Bauer, Lujo and
+    Christin, Nicolas and
+    Cranor, Lorrie Faith and
+    Egelman, Serge},
+  title={Of passwords and people:
+    Measuring the effect of password-composition policies},
+  booktitle={CHI},
+  year={2011},
+  URL={http://cups.cs.cmu.edu/rshay/pubs/passwords_and_people2011.pdf},
+}
+@inproceedings{Komanduri2014can,
+  title={Can long passwords be secure and usable?},
+  author={Shay, Richard and
+    Komanduri, Saranga and
+    Durity, Adam L. and
+    Huh, Phillip Seyoung and
+    Mazurek, Michelle L. and
+    Segreti, Sean M. and
+    Ur, Blase and
+    Bauer, Lujo and
+    Christin, Nicolas and
+    Cranor, Lorrie Faith},
+  booktitle={Proceedings of the 32nd annual ACM conference on Human factors in
+    computing systems},
+  pages={2927--2936},
+  year={2014},
+  organization={ACM},
+  URL={http://lorrie.cranor.org/pubs/longpass-chi2014.pdf},
+}
+
+@misc{xkcd936,
+  author={xkcd},
+  title={Password Strength},
+  URL={https://xkcd.com/936/},
+}
+@misc{xkcd792,
+  author={xkcd},
+  title={Password Reuse},
+  URL={https://xkcd.com/792/},
+}
+@misc{xkcd1286,
+  author={xkcd},
+  title={Encryptic},
+  URL={https://xkcd.com/1286/},
+}
+
+@inproceedings{Bonneau2012sog,
+  author={Bonneau, Joseph},
+  title={The science of guessing:
+    analyzing an anonymized corpus of 70 million passwords},
+  booktitle={{IEEE Symposium on Security and Privacy}},
+  year={2012},
+  URL={http://www.cl.cam.ac.uk/~jcb82/doc/B12-IEEESP-analyzing_70M_anonymized_passwords.pdf},
+}
+@inproceedings{Bonneau2012lpo,
+  author={Bonneau, Joseph and Shutova, Ekaterina},
+  title={Linguistic properties of multi-word passwords},
+  booktitle={{USEC}},
+  year={2012},
+  URL={http://www.cl.cam.ac.uk/~jcb82/doc/BS12-USEC-passphrase_linguistics.pdf},
+}
+@techreport{Kuo2006hso,
+  author={Kuo, Cynthia and Romanosky, Sasha and Cranor, Lorrie Faith},
+  title={{Human Selection of Mnemonic Phrase-based Passwords}},
+  institution={Institute of Software Research},
+  number={36},
+  year={2006},
+  URL={http://repository.cmu.edu/isr/36/},
+}
+
+@techreport{GuessingHumanChosenSecrets,
+  author={Bonneau, Joseph},
+  title={{Guessing human-chosen secrets}},
+  institution={University of Cambridge, Computer Laboratory},
+  number={UCAM-CL-TR-819},
+  year={2012},
+  month={5},
+  URL={http://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-819.pdf},
+}
diff --git a/shannon-entropy/shannon-entropy.tex b/shannon-entropy/shannon-entropy.tex
new file mode 100644
index 0000000..03e9e94
--- /dev/null
+++ b/shannon-entropy/shannon-entropy.tex
@@ -0,0 +1,80 @@
+%\documentclass[handout]{beamer}
+\documentclass{beamer}
+\input{preamble.tex}
+
+\usepackage{xparse}
+\ProvideDocumentEnvironment{exercise}{o}{%
+  \setbeamercolor{block body}{bg=yellow!30,fg=black}
+  \setbeamercolor{block title}{bg=yellow,fg=black}
+  \IfValueTF{#1}{%
+    \begin{block}{Exercise: #1}
+  }{%
+    \begin{block}{Exercise}
+  }
+}{%
+  \end{block}
+}
+\ProvideDocumentEnvironment{remark}{o}{% + \IfValueTF{#1}{% + \begin{alertblock}{Remark: #1} + }{% + \begin{alertblock}{Remark} + } +}{% + \end{alertblock} +} +\DeclareMathOperator{\powerset}{\mathcal{P}} +\DeclareMathOperator{\p}{\mathcal{P}} +\let\P\p +\DeclareMathOperator{\C}{\mathcal{C}} +\DeclareMathOperator{\K}{\mathcal{K}} +\DeclareMathOperator{\E}{\mathcal{E}} +\DeclareMathOperator{\D}{\mathcal{D}} + +\DeclareMathOperator{\N}{\mathbb{N}} +\DeclareMathOperator{\Z}{\mathbb{Z}} +\DeclareMathOperator{\R}{\mathbb{R}} + +\let\stoch\mathbf{} + +\DeclareMathOperator{\xor}{\oplus} + +\renewcommand{\qedsymbol}{Q.E.D.} + +\mode{% + \usetheme{Berlin} + \setbeamercovered{transparent} +} +\setbeamertemplate{footline}{\insertframenumber} + +\title{% + Shannon entropy +} +\author{% + Daniel Bosk +} +\institute[MIUN IKS]{% + Department of Information and Communication Systems,\\ + Mid Sweden University, Sundsvall. +} +\date{\today} + +\AtBeginSection[]{% + \begin{frame} + \tableofcontents[currentsection] + \end{frame} +} + +\begin{document} + +\begin{frame} + \titlepage{} +\end{frame} + +\begin{frame} + \tableofcontents +\end{frame} + +\mode{\input{contents.tex}} + +\end{document}
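The password analysis in applications/contents.tex leans on subadditivity, H(x_1, ..., x_n) <= H(x_1) + ... + H(x_n), summing entropy estimates of individual password properties into an upper bound. A small empirical Python illustration with toy data (hypothetical passwords, my own sketch):

    import math
    from collections import Counter

    def entropy(samples):
        # Empirical Shannon entropy of a list of observed values.
        n = len(samples)
        return -sum(c / n * math.log2(c / n) for c in Counter(samples).values())

    # Two correlated password properties: length and number of digits.
    pwds = ["letmein1", "password", "123456", "qwerty12", "abc12345"]
    lengths = [len(p) for p in pwds]
    digits = [sum(ch.isdigit() for ch in p) for p in pwds]

    print(entropy(list(zip(lengths, digits))))  # joint entropy
    print(entropy(lengths) + entropy(digits))   # sum of marginals, always >= joint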