diff --git a/latex/splash2024/splash.tex b/latex/splash2024/splash.tex index c0a30fb6..d13a7304 100644 --- a/latex/splash2024/splash.tex +++ b/latex/splash2024/splash.tex @@ -309,7 +309,7 @@ \begin{wrapfigure}{r}{0.5\textwidth} \vspace{-0.3cm} \begin{center} - \input{nfa_cfg.tex} + \input{nfa_cfg} \end{center} \caption{NFA recognizing Levenshtein $L(\sigma: \Sigma^5, 3)$.}\label{fig:lev_nfa} \vspace{-0.5cm} diff --git a/latex/tidyparse/preamble.tex b/latex/tidyparse/preamble.tex index 8d6eaee3..b973f6f7 100644 --- a/latex/tidyparse/preamble.tex +++ b/latex/tidyparse/preamble.tex @@ -79,16 +79,6 @@ \DeclareRobustCommand{\hlred}[1]{{\sethlcolor{pink}\hl{#1}}} \usepackage{fontspec} -\setmonofont[Scale=0.8]{JetBrainsMono}[ - Contextuals={Alternate}, - Path=./font/, - Extension = .ttf, - UprightFont=*-Regular, - BoldFont=*-Bold, - ItalicFont=*-Italic, - BoldItalicFont=*-BoldItalic -] - \usepackage[skins,breakable,listings]{tcolorbox} \usepackage[dvipsnames, table]{xcolor} @@ -609,4 +599,73 @@ \draw [-to] (0pt,0pt) -- (6pt,0pt); \draw [-to] (0pt,0pt) -- (12pt,6pt); } -} \ No newline at end of file +} + +\usetikzlibrary{shapes.geometric, arrows} + +\tikzstyle{startstop} = [rectangle, rounded corners, +minimum width=3cm, +minimum height=1cm, +thick, +text centered, +draw=none, +] + +\tikzstyle{plain} = [ +rectangle, +rounded corners, +minimum width=3.5cm, +minimum height=1cm, +thick, +text centered, +draw=black +] + +\tikzstyle{io} = [trapezium, +trapezium stretches=true, % A later addition +thick, +trapezium left angle=70, +trapezium right angle=110, +minimum width=3cm, +minimum height=1cm, text centered, +draw=black, fill=blue!30] + +\tikzstyle{io2} = [trapezium, +trapezium stretches=true, % A later addition +thick, +trapezium left angle=70, +trapezium right angle=110, +minimum width=3cm, +minimum height=1cm, text centered, +draw=black, fill=black!30] + +\tikzstyle{process} = [rectangle, +minimum width=3.5cm, +minimum height=1cm, +thick, +text centered, +text width=4cm, +draw=black, +fill=orange!30] + +\tikzstyle{decision} = [diamond, +minimum width=2.5cm, +minimum height=0.5cm, +thick, +text centered, +draw=black, +fill=green!30] +\tikzstyle{arrow} = [->,thick] + +%\usetikzlibrary{external} +%\tikzexternalize[prefix=figures/] +\definecolor{green}{RGB}{0,128,0} +\definecolor{darkgray176}{RGB}{176,176,176} +\definecolor{darkviolet1270255}{RGB}{127,0,255} +\definecolor{deepskyblue3176236}{RGB}{3,176,236} +\definecolor{dodgerblue45123246}{RGB}{45,123,246} +\definecolor{lightgray204}{RGB}{204,204,204} +\definecolor{royalblue8762253}{RGB}{87,62,253} + +\usepackage{sankey} +\usepackage{wrapfig} \ No newline at end of file diff --git a/latex/tidyparse/presentation.pdf b/latex/tidyparse/presentation.pdf index f4bb4807..18160570 100644 Binary files a/latex/tidyparse/presentation.pdf and b/latex/tidyparse/presentation.pdf differ diff --git a/latex/tidyparse/presentation.tex b/latex/tidyparse/presentation.tex index 92a4867c..6b72af73 100644 --- a/latex/tidyparse/presentation.tex +++ b/latex/tidyparse/presentation.tex @@ -92,44 +92,6 @@ % \tableofcontents %\end{frame} -\begin{frame}[fragile]{Can you spot the error?} - \begin{center} - \begin{tabular}{|m{5.5cm}|m{5.5cm}|} - \hline \rule{0pt}{2.5ex}\textbf{Original code}\rule[-1ex]{0pt}{2ex} & \rule{0pt}{2.5ex}\textbf{Human repair}\rule[-1ex]{0pt}{2ex} \\\hline - \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] - - form sympy import * - x = Symbol('x', real=True) - x, re(x), im(x) - - \end{lstlisting} & 
\begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] - - \end{lstlisting} \\\hline - \end{tabular} - \end{center} -\end{frame} - -\begin{frame}[fragile]{Can you spot the error?} - \begin{center} - \begin{tabular}{|m{5.5cm}|m{5.5cm}|} - \hline \rule{0pt}{2.5ex}\textbf{Original code}\rule[-1ex]{0pt}{2ex} & \rule{0pt}{2.5ex}\textbf{Human repair}\rule[-1ex]{0pt}{2ex} \\\hline - \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] - - !\hlorange{form}! sympy import * - x = Symbol('x', real=True) - x, re(x), im(x) - - \end{lstlisting} & \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] - - !\hlorange{from}! sympy import * - x = Symbol('x', real=True) - x, re(x), im(x) - - \end{lstlisting} \\\hline - \end{tabular} - \end{center} -\end{frame} - \begin{frame}[fragile]{Can you spot the error?} \begin{center} \begin{tabular}{|m{5.5cm}|m{5.5cm}|} @@ -175,9 +137,11 @@ \hline \rule{0pt}{2.5ex}\textbf{Original code}\rule[-1ex]{0pt}{2ex} & \rule{0pt}{2.5ex}\textbf{Human repair}\rule[-1ex]{0pt}{2ex} \\\hline \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] - my_list = [] - for i in range(10); - my_list.append(2*i) + def average(values): + if values == (1,2,3): + return (1+2+3)/3 + else if values == (-3,2): + return (-3+2+8-1)/4 \end{lstlisting} & \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] @@ -192,15 +156,19 @@ \hline \rule{0pt}{2.5ex}\textbf{Original code}\rule[-1ex]{0pt}{2ex} & \rule{0pt}{2.5ex}\textbf{Human repair}\rule[-1ex]{0pt}{2ex} \\\hline \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] - my_list = [] - for i in range(10)!\hlorange{;}! - my_list.append(2*i) + def average(values): + if values == (1,2,3): + return (1+2+3)/3 + !\hlorange{else}! !\hlred{if}! values == (-3,2): + return (-3+2+8-1)/4 \end{lstlisting} & \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] - my_list = [] - for i in range(10)!\hlorange{:}! - my_list.append(2*i) + def average(values): + if values == (1,2,3): + return (1+2+3)/3 + !\hlorange{elif}! 
values == (-3,2): + return (-3+2+8-1)/4 \end{lstlisting} \\\hline \end{tabular} @@ -245,45 +213,118 @@ \end{center} \end{frame} -\begin{frame}[fragile]{Can you spot the error?} - \begin{center} - \begin{tabular}{|m{5.5cm}|m{5.5cm}|} - \hline \rule{0pt}{2.5ex}\textbf{Original code}\rule[-1ex]{0pt}{2ex} & \rule{0pt}{2.5ex}\textbf{Human repair}\rule[-1ex]{0pt}{2ex} \\\hline - \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] +%\begin{frame}[fragile]{Can you spot the error?} +% \begin{center} +% \begin{tabular}{|m{5.5cm}|m{5.5cm}|} +% \hline \rule{0pt}{2.5ex}\textbf{Original code}\rule[-1ex]{0pt}{2ex} & \rule{0pt}{2.5ex}\textbf{Human repair}\rule[-1ex]{0pt}{2ex} \\\hline +% \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] +% +% try: +% something() +% catch AttributeError: +% pass +% +% \end{lstlisting} & \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] +% +% \end{lstlisting} \\\hline +% \end{tabular} +% \end{center} +%\end{frame} +% +%\begin{frame}[fragile]{Can you spot the error?} +% \begin{center} +% \begin{tabular}{|m{5.5cm}|m{5.5cm}|} +% \hline \rule{0pt}{2.5ex}\textbf{Original code}\rule[-1ex]{0pt}{2ex} & \rule{0pt}{2.5ex}\textbf{Human repair}\rule[-1ex]{0pt}{2ex} \\\hline +% \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] +% +% try: +% something() +% !\hlorange{catch}! AttributeError: +% pass +% +% \end{lstlisting} & \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] +% +% try: +% something() +% !\hlorange{except}! AttributeError: +% pass +% +% \end{lstlisting} \\\hline +% \end{tabular} +% \end{center} +%\end{frame} - try: - something() - catch AttributeError: - pass +\begin{frame}[t,fragile]{How many repairs could there possibly be?} + Consider the following Python snippet, which contains a small syntax error:\\ - \end{lstlisting} & \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] + \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] + def prepend(i, k, L=[]) + n and [prepend(i - 1, k, [b] + L) for b in range(k)] + \end{lstlisting} +\end{frame} - \end{lstlisting} \\\hline - \end{tabular} - \end{center} +\begin{frame}[t,fragile]{How many repairs could there possibly be?} + Consider the following Python snippet, which contains a small syntax error:\\ + + \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] + def prepend(i, k, L=[]) + n and [prepend(i - 1, k, [b] + L) for b in range(k)] + \end{lstlisting} + + It can be fixed by appending a colon after the function signature, yielding:\\ + + \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] + def prepend(i, k, L=[])!\hlgreen{:}! + n and [prepend(i - 1, k, [b] + L) for b in range(k)] + \end{lstlisting} \end{frame} -\begin{frame}[fragile]{Can you spot the error?} - \begin{center} - \begin{tabular}{|m{5.5cm}|m{5.5cm}|} - \hline \rule{0pt}{2.5ex}\textbf{Original code}\rule[-1ex]{0pt}{2ex} & \rule{0pt}{2.5ex}\textbf{Human repair}\rule[-1ex]{0pt}{2ex} \\\hline - \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] +\begin{frame}[t,fragile]{How many repairs could there possibly be?} + Consider the following Python snippet, which contains a small syntax error:\\ - try: - something() - !\hlorange{catch}! 
AttributeError: - pass + \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] + def prepend(i, k, L=[]) + n and [prepend(i - 1, k, [b] + L) for b in range(k)] + \end{lstlisting} - \end{lstlisting} & \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] + It can be fixed by appending a colon after the function signature, yielding:\\ - try: - something() - !\hlorange{except}! AttributeError: - pass + \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] + def prepend(i, k, L=[])!\hlgreen{:}! + n and [prepend(i - 1, k, [b] + L) for b in range(k)] + \end{lstlisting} - \end{lstlisting} \\\hline - \end{tabular} - \end{center} + \vspace{0.5cm} + + \normalsize Let us consider a slightly more ambiguous error: \footnotesize{\texttt{v = df.iloc(5:, 2:)}}. \normalsize Assuming an alphabet of just a hundred lexical tokens, this statement has millions of two-token edits, yet only six are accepted by the Python parser: +\end{frame} + +\begin{frame}[t,fragile]{How many repairs could there possibly be?} + Consider the following Python snippet, which contains a small syntax error:\\ + + \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] + def prepend(i, k, L=[]) + n and [prepend(i - 1, k, [b] + L) for b in range(k)] + \end{lstlisting} + + It can be fixed by appending a colon after the function signature, yielding:\\ + + \begin{lstlisting}[escapechar=!, basicstyle=\linespread{1.3}\ttfamily\footnotesize] + def prepend(i, k, L=[])!\hlgreen{:}! + n and [prepend(i - 1, k, [b] + L) for b in range(k)] + \end{lstlisting} + + \vspace{0.5cm} + + \normalsize Let us consider a slightly more ambiguous error: \footnotesize{\texttt{v = df.iloc(5:, 2:)}}. \normalsize Assuming an alphabet of just a hundred lexical tokens, this statement has millions of two-token edits, yet only six are accepted by the Python parser: + + \scriptsize + \begin{figure}[h!] + \noindent\begin{tabular}{@{}l@{\hspace{10pt}}l@{\hspace{10pt}}l@{}} + (1) \texttt{v = df.iloc(5\hlred{:}, 2\hlorange{,})} & (3) \texttt{v = df.iloc(5\hlgreen{[}:, 2:\hlgreen{]})} & (5) \texttt{v = df.iloc\hlorange{[}5:, 2:\hlorange{]}} \\\\ + (2) \texttt{v = df.iloc(5\hlorange{)}, 2\hlorange{(})} & (4) \texttt{v = df.iloc(5\hlred{:}, 2\hlred{:})} & (6) \texttt{v = df.iloc(5\hlgreen{[}:, 2\hlorange{]})} \\ + \end{tabular} + \end{figure} \end{frame} \begin{frame}[fragile]{On the virtues of pragmatism} @@ -302,7 +343,7 @@ \begin{columns}[T] \begin{column}{0.9\textwidth} \begin{itemize} - \item Pioneered in the 19th century by Pierce, James, Dewey, et al. + \item Pioneered in the 19th century by Peirce, James, Dewey, et al. \item Wittgenstein was a pragmatist, early work on language games. \item Pragmatism is a philosophy of language that emphasizes the role of intent in human communication. \item Language is a tool for communication, \\not just a arbitrary set of rules. 
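To put a number on the ``millions of two-token edits'' claim above (a rough, deliberately overcounting estimate; we assume the statement lexes to roughly a dozen tokens and ignore duplicate or overlapping edits), the single-token edits alone number

\begin{equation*}
  \underbrace{(n+1)\,|\Sigma|}_{\text{insertions}} \;+\; \underbrace{n}_{\text{deletions}} \;+\; \underbrace{n\,(|\Sigma|-1)}_{\text{substitutions}} \;\approx\; 2.5\times10^{3} \quad \text{for } n \approx 12,\; |\Sigma| = 100,
\end{equation*}

so pairs of single-token edits number on the order of $\binom{2.5\times10^{3}}{2} \approx 3\times10^{6}$: millions of two-token candidates, of which only the six shown above parse.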
@@ -320,53 +361,74 @@ \end{columns} \end{frame} -\begin{frame}[fragile]{Common sources of syntax errors} +%\begin{frame}[fragile]{Common sources of syntax errors} +% \begin{itemize} +% \item Reading impairments (e.g., dyslexia, dysgraphia) +% \item Motor impairments (e.g., tremors, Parkinson's) +% \item Speech impediments (e.g., stuttering, apraxia, Tourette's) +% \item Visual impairments (e.g., poor eyesight, blindness) +% \item Language barriers (e.g., foreign and non-native speakers) +% \item Inexperience (e.g., novice programmers) +% \item Distraction (e.g., multitasking, fatigue, stress) +% \item Time pressure (e.g., deadlines, interview coding) +% \item Inattention (e.g., typographic mistakes, boredom, apathy) +% \item Lack of feedback (e.g., no syntax highlighting or IDE) +% \end{itemize} +%\end{frame} + +\begin{frame}{From Error-Correcting Codes to Correcting Coding Errors} \begin{itemize} - \item Reading impairments (e.g., dyslexia, dysgraphia) - \item Motor impairments (e.g., tremors, Parkinson's) - \item Speech impediments (e.g., stuttering, apraxia, Tourette's) - \item Visual impairments (e.g., poor eyesight, blindness) - \item Language barriers (e.g., foreign and non-native speakers) - \item Inexperience (e.g., novice programmers) - \item Distraction (e.g., multitasking, fatigue, stress) - \item Time pressure (e.g., deadlines, interview coding) - \item Inattention (e.g., typographic mistakes, boredom, apathy) - \item Lack of feedback (e.g., no syntax highlighting or IDE) - \end{itemize} + \item Error-correcting codes are a well-studied topic in information theory used to detect and correct errors in data transmission. + \item Introduces parity bits to detect and correct transmission errors assuming a certain noise model (e.g., Hamming distance). + \item Like ECCs, we also assume a certain noise model (Levenshtein distance) and error tolerance (n-lexical tokens). + \item Instead of injecting parity bits, we use the grammar and mutual information between tokens to detect and correct errors. + \item Unlike ECCs, we do not assume a unique solution, but a set of admissible solutions ranked by statistical likliehood. + \end{itemize} + \setlength{\epigraphwidth}{0.97\textwidth} + \epigraph{``\textit{Damn it, if the machine can detect an error, why can't it\\\phantom{``}locate the position of the error and correct it?'}''}{Richard Hamming, 1915-1998} \end{frame} \begin{frame}[fragile]{Syntax repair as a language game} - \begin{itemize} \item Imagine a game between two players, \textit{Editor} and \textit{Author}. \item They both see the same grammar, $\mathcal{G}$ and invalid string $\err\sigma \notin \mathcal{L}(\mathcal{G})$. % Both players move simultaneously after a short period of deliberation. As soon as Player A repairs $\err\sigma$, Player B must move immediately. \item Author moves by modifying $\err\sigma$ to produce a valid string, $\sigma \in \mathcal{L}(\mathcal{G})$. - \item Editor moves continuously, sampling a set $\tilde{\bm\sigma} \in \mathcal{P}\big(\mathcal{L}(\mathcal{G})\big)$. + \item Editor moves continuously, sampling a set $\tilde{\bm\sigma} \subseteq \mathcal{L}(\mathcal{G})$. \item As soon as Author repairs $\err\sigma$, the turn immediately ends. \item Neither player sees the other's move(s) before making their own. \item If Editor anticipates Author's move, i.e., $\sigma \in \tilde{\bm\sigma}$, they both win. \item If Author surprises Editor with a valid move, i.e., $\sigma \notin \tilde{\bm\sigma}$, Author wins. 
\item We may consider a refinement where Editor wins in proportion to the time taken to anticipate Author's move. \end{itemize} - \end{frame} -\begin{frame}{From Error-Correcting Codes to Correcting Coding Errors} - \begin{itemize} - \item Error-correcting codes are a well-studied topic in information theory used to detect and correct errors in data transmission. - \item Introduces parity bits to detect and correct transmission errors assuming a certain noise model (e.g., Hamming distance). - \item Like ECCs, we also assume a certain noise model (Levenshtein distance) and error tolerance (n-lexical tokens). - \item Instead of injecting parity bits, we use the grammar and mutual information between tokens to detect and correct errors. - \item Unlike ECCs, we do not assume a unique solution, but a set of admissible solutions ranked by statistical likliehood. - \end{itemize} - \setlength{\epigraphwidth}{0.97\textwidth} - \epigraph{``\textit{Damn it, if the machine can detect an error, why can't it\\\phantom{``}locate the position of the error and correct it?'}''}{Richard Hamming, 1915-1998} + +\begin{frame}[fragile]{Problem Statement: Validity and naturalness} + Syntax repair can be treated as a language intersection problem between a context-free language (CFL) and a regular language. + + \begin{definition}[Reachable edits] + Given a CFL, $\ell$, and an invalid string, $\err{\sigma}: \ell^\complement$, find every valid string reachable within $d$ edits of $\err{\sigma}$, i.e., letting $\Delta$ be the Levenshtein metric and $L(\err\sigma, d) = \{\sigma' \mid \Delta(\err{\sigma}, \sigma') \leq d\}$ be the edit ball, we seek $A = L(\err\sigma, d) \cap \ell$. + \end{definition} + + \begin{definition}[Ranked repair] + Given a finite language $A = L(\err\sigma, d) \cap \ell$ and a probabilistic language model $\text{P}_\theta: \Sigma^* \rightarrow [0, 1] \subset \mathbb{R}$, the ranked repair problem is to find the top-$k$ maximum likelihood repairs under the language model. That is, + \begin{equation} + R(A, P_\theta) = \argmax_{\bm{\sigma} \subseteq A, |\bm{\sigma}| \leq k} \sum_{\sigma \in \bm{\sigma}}\text{P}_\theta(\sigma) + \end{equation} + \end{definition} +% To solve this problem, we will first pose a simpler problem that only considers localized edits, then turn our attention back to BCFLR. +% +% \begin{definition}[Porous completion] +% Let $\underline\Sigma \coloneqq \Sigma \cup \{\hole\}$, where \hole denotes a hole. We denote $\sqsubseteq: \Sigma^n \times \underline\Sigma^n$ as the relation $\{\langle\sigma', \sigma\rangle \mid \sigma_i \in \Sigma \implies \sigma_i' = \sigma_i\}$ and the set of all inhabitants $\{\sigma' \mid \sigma' \sqsubseteq \sigma\}$ as $\text{H}(\sigma)$. Given a \textit{porous string}, $\sigma: \underline\Sigma^*$ we seek all syntactically admissible inhabitants, i.e., $A(\sigma)\coloneqq\text{H}(\sigma)\cap\ell$. +% \end{definition} \end{frame} -\begin{frame}[fragile]{Our Contribution} - Helps novice programmers fix syntax errors in source code. We do so by solving the \textbf{Realtime Bounded Levenshtein Reachability Problem}:\vspace{10pt} +\begin{frame}[fragile]{Problem Statement: Temporal constraints} + Find every syntactically admissible edit $\{\sigma' \in \ell \mid \Delta(\err{\sigma}, \sigma') \leq d\}$, ranked by a probability metric $P_\theta$, and return them in a reasonable amount of time. 
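  To make the reachable-edits and ranked-repair definitions above concrete, here is a brute-force reference sketch (our illustration, not Tidyparse's algorithm): it materializes the whole edit ball and filters it with a validity oracle, so it is only feasible for a toy language (Dyck-1) and a small radius $d$, and plain edit distance stands in for the probabilistic score $P_\theta$.

  \begin{lstlisting}[basicstyle=\ttfamily\scriptsize]
from itertools import product

def lev(a, b):
    # classic dynamic-programming Levenshtein distance over token sequences
    dp = list(range(len(b) + 1))
    for i in range(1, len(a) + 1):
        prev, dp[0] = dp[0], i
        for j in range(1, len(b) + 1):
            prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1, prev + (a[i - 1] != b[j - 1]))
    return dp[-1]

def dyck1_valid(s):
    # validity oracle for the toy language: Dyck-1, i.e. balanced parentheses
    depth = 0
    for c in s:
        depth += 1 if c == '(' else -1
        if depth < 0: return False
    return depth == 0

def ranked_repairs(broken, alphabet=('(', ')'), d=2, k=10):
    # A = L(broken, d) intersected with Dyck-1, returned nearest-first
    ball = (s for n in range(max(0, len(broken) - d), len(broken) + d + 1)
              for s in product(alphabet, repeat=n) if lev(broken, s) <= d)
    return sorted(filter(dyck1_valid, set(ball)), key=lambda s: lev(broken, s))[:k]

print(ranked_repairs(('(', '(', ')')))  # nearest valid strings first
  \end{lstlisting}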
- Given a context-free language $\ell: \mathcal{L}(\mathcal{G})$ and an invalid string $\err{\sigma}: \bar\ell$, find every syntactically admissible edit $\tilde{\sigma}$ satisfying $\{\tilde{\sigma} \in \ell \mid \Delta(\err{\sigma}, \ell) < r\}$, ranked by a probability metric $\Delta$, under hard realtime constraints. + \begin{definition}[Linear convergence]\label{def:linear-convergence} + Given a finite CFL, $\ell$, we want a generating function, $\bm{\varphi}: \mathbb{N}_{<|\ell|} \rightarrow 2^\ell$, that converges linearly in expectation, i.e., $\mathbb{E}_{i \in [1, n]}|\bm{\varphi}(i)| \propto n$. + \end{definition} \begin{center} \resizebox{0.42\textwidth}{!}{ @@ -401,258 +463,391 @@ \node [above,yshift=2.1cm] at (e) {$\mathcal{L}(\mathcal{G}')$}; \end{tikzpicture} } - \scalebox{7}{\large$^+$\Huge\fontspec{Arial Unicode MS}^^^^231b} + \scalebox{7}{\large$^+$\raisebox{0.2ex}{\huge\fontspec{Arial Unicode MS}^^^^231b}} \end{center} \textbf{Natural language:} - \textit{Rapidly finds syntactically valid edits within a small neighborhood, ranked by tokenwise similarity and statistical likliehood.} + \textit{Retrieve as many syntactically valid repairs as possible within a small neighborhood and time frame, ranked by naturalness.} \end{frame} -\begin{frame}[fragile]{Bounded Levenshtein Reachability} - Syntax repair can be treated as a language intersection problem between a context-free language (CFL) and a regular language. +%\begin{frame}[fragile]{Ranked repair under realtime constraints} +% $A(\sigma)$ is often a very large-cardinality set, so we want a procedure which prioritizes likely repairs first, without exhaustive enumeration. Specifically, +% \begin{definition}[Ranked repair] +% Given a finite language $\ell_\cap = L(\err\sigma, d) \cap \ell$ and a probabilistic language model $P_\theta: \Sigma^* \rightarrow [0, 1] \subset \mathbb{R}$, the ranked repair problem is to find the top-$k$ repairs by likelihood under the language model. That is, +% \begin{equation} +% R(\ell_\cap, P_\theta) \coloneqq \argmax_{\{\bm{\sigma} \mid \bm{\sigma} \subseteq \ell_\cap, |\bm{\sigma}| \leq k\}} \sum_{\sigma \in \bm{\sigma}}\text{P}(\sigma\mid\err\sigma, \theta) +% \end{equation} +% % On average, across all $G, \sigma$ $\hat{R}$ should approximate $R$. +% We want a procedure $\hat{R}$, minimizing $\mathbb{E}_{G, \sigma}\big[D_{\text{KL}}(\hat{R} \parallel R)\big]$ and total latency. +% \end{definition} +% +% Since $R$ is intractable in general, we want a procedure that approximates it for a representative sampling of natural context-free grammars and strings, i.e., real-world programming languages and source code snippets. +%\end{frame} - \begin{definition}[Bounded Levenshtein-CFL reachability] - Given a CFL $\ell$ and an invalid string $\err{\sigma}: \bar\ell$, the BCFLR problem is to find every valid string reachable within $d$ edits of $\err{\sigma}$, i.e., we seek $L(\err\sigma, d) \cap \ell$ where $L(\err\sigma, d) \coloneqq \{\sigma \mid \Delta(\err{\sigma}, \sigma) \leq d\}$ and $\Delta$ is the Levenshtein metric. - \end{definition} +%\begin{frame}[fragile]{From CFL Reachability to Real World Program Repair} +% To fix real code, we needed to overcome a variety of interesting challenges:\vspace{10pt} +% +% \begin{itemize} +% \item \textbf{Syntax mismatch}: The syntax of real-world programming languages does not exactly correspond to the theory of formal languages. 
+% \item \textbf{Source code $\approx$ PL}: Most of the time, source code in the wild is incomplete or only loosely approximates a programming language. +% \item \textbf{Responsiveness}: The usefulness of synthetic repairs is inversely proportional to the amount of time required to generate them. +% \item \textbf{Edit generation}: How do we generate edits that are (1) syntactically admissible (2) statistically plausible and (3) semantically meaningful? +% \item \textbf{Evaluation}: Big code and version control is too coarse-grained, contains irrelevant edits, not representative of small errors/fixes. +% \end{itemize} +%\end{frame} - To solve this problem, we will first pose a simpler problem that only considers localized edits, then turn our attention back to BCFLR. +%\begin{frame}[fragile]{Design Criteria} +% \begin{itemize} +% \item First and foremost, the framework must be \textbf{sound} and \textbf{complete}. It must (1) accept arbitrary CFGs, and (2) recognize all and only syntactically valid strings within a fixed edit distance of a given string. +% \item Second, we strongly prefer repairs to be \textbf{plausible} and \textbf{consistent}, i.e., likely to be expressed in practice and consistent with the author's intent and surrounding context (e.g., stylistically, semantically) +% \item Third, recommendations must be \textbf{efficient} and \textbf{responsive}. We must be able to recognize valid strings in subcubic time, and generate repairs in (at worst) polynomial time. These criteria are necessary to provide real-time feedback whilst the user is typing. +% \item Fourth, the framework must be \textbf{robust} and \textbf{scalable}. Ceteris paribus, the framework should be robust to multiple errors, handle large grammars and scale linearly with additional processors. +% \end{itemize} +%\end{frame} - \begin{definition}[Porous completion] - Let $\underline\Sigma \coloneqq \Sigma \cup \{\hole\}$, where \hole denotes a hole. We denote $\sqsubseteq: \Sigma^n \times \underline\Sigma^n$ as the relation $\{\langle\sigma', \sigma\rangle \mid \sigma_i \in \Sigma \implies \sigma_i' = \sigma_i\}$ and the set of all inhabitants $\{\sigma' \mid \sigma' \sqsubseteq \sigma\}$ as $\text{H}(\sigma)$. Given a \textit{porous string}, $\sigma: \underline\Sigma^*$ we seek all syntactically admissible inhabitants, i.e., $A(\sigma)\coloneqq\text{H}(\sigma)\cap\ell$. - \end{definition} -\end{frame} +\begin{frame}[fragile]{High-level architecture overview} + \vspace{0.4cm} + \begin{figure}[h!] 
+ \vspace{-1cm} + \begin{center} + \resizebox{0.8\textwidth}{!}{ + \begin{tikzpicture}[node distance=5cm] + \node (start) [io] {Broken code}; + \node (node1) [plain, right of=start] {\phantom{...}\textbf{Language intersection}\phantom{...}}; + \node (gram1) [io2, above of=node1, yshift=-3cm] {Syntax}; + \node (node2) [plain, right of=node1] {\textbf{Repair extraction}}; +% \node (ptree) [io, above of=node2, yshift=-3cm] {$\mathbb{T}_2$}; + \node (node3) [plain, right of=node2] {\textbf{Reranking}}; + \node (ngram) [io2, above of=node3, yshift=-3cm] {Markov chain}; + \node (node4) [io, right of=node3] {Repairs}; + \draw [arrow] (start) -- (node1); + \draw [arrow] (gram1) -- (node1); + \draw [arrow] (node1) -- (node2); + \draw [arrow] (node2) -- (node3); + \draw [arrow] (node3) -- (node4); + \draw [arrow] (ngram) -- (node3); +% \draw [arrow] (ptree) -- (node2); + \end{tikzpicture} + } + \end{center} + \end{figure} -\begin{frame}[fragile]{Ranked repair under realtime constraints} - $A(\sigma)$ is often a very large-cardinality set, so we want a procedure which prioritizes likely repairs first, without exhaustive enumeration. Specifically, - \begin{definition}[Ranked repair] - Given a finite language $\ell_\cap = L(\err\sigma, d) \cap \ell$ and a probabilistic language model $P_\theta: \Sigma^* \rightarrow [0, 1] \subset \mathbb{R}$, the ranked repair problem is to find the top-$k$ repairs by likelihood under the language model. That is, - \begin{equation} - R(\ell_\cap, P_\theta) \coloneqq \argmax_{\{\bm{\sigma} \mid \bm{\sigma} \subseteq \ell_\cap, |\bm{\sigma}| \leq k\}} \sum_{\sigma \in \bm{\sigma}}\text{P}(\sigma\mid\err\sigma, \theta) - \end{equation} - % On average, across all $G, \sigma$ $\hat{R}$ should approximate $R$. - We want a procedure $\hat{R}$, minimizing $\mathbb{E}_{G, \sigma}\big[D_{\text{KL}}(\hat{R} \parallel R)\big]$ and total latency. - \end{definition} + \vspace{0.4cm} + + \begin{wrapfigure}{r}{0.4\textwidth} +%\begin{figure}[h!] 
+ \vspace{-0.7cm} + \begin{center} + \resizebox{0.39\textwidth}{!}{ + \begin{tikzpicture}[node distance=2cm] + \node (start) [startstop, draw=none]; + \node (pro1) [process, below of=start, yshift=-0.3cm] {$G_\cap \leftarrow G\cap\Delta(\err\sigma, d)$}; + \node [above=0.07cm of pro1] {}; + \node (pcfg) [io2, left of=pro1, xshift=-3cm] {[P]CFG}; + \node [below=0.07cm of pcfg] {}; + \node [below=2.3cm of pcfg, xshift=0.4cm] {\Large\textbf{Language intersection}}; + \node (lnfa) [io, right of=pro1, xshift=3cm] {L-NFA}; + \node [above=0.07cm of lnfa, xshift=1cm] {}; + + \node (code) [io, right of=start,xshift=3cm] {Code}; + \node (synt) [io2, left of=start,xshift=-3cm] {Syntax}; + + \node (dec1) [decision, below of=pro1, yshift=-0.5cm] {$[G_{\cap} = \varnothing]$}; + + \node (pro2b) [process, right of=dec1, xshift=3cm] {Increase radius, $d$}; + + \draw[thick,dotted, rounded corners] ($(pcfg.north west)+(-1.9,0.7)$) rectangle ($(pro2b.south east)+(0.3,-0.6)$); + + \node (const) [process, below of=dec1, yshift=-1.1cm] {Construct $\mathbb{T}_2$ from $G_\cap'$}; + \node [above=0.07cm of const, xshift=1.5cm] {}; + + \node (dec2) [decision, below of=const, yshift=-0.5cm] {$|\mathcal{L}(G_\cap)|$}; + + \node (samp1) [process, left of=dec2, xshift=-3cm] {Enumerate $\sigma' \in \mathcal{L}(G_\cap)$}; + \node [above=0.07cm of samp1] {}; + \node [above=2.3cm of samp1,xshift=-0.2cm] {\Large\textbf{Repair extraction}}; + \node (samp2) [process, right of=dec2, xshift=3cm] {Sample $\sigma' \sim P(G_\cap)$}; + \node [above=0.07cm of samp2] {}; + + \draw[thick,dotted, rounded corners] ($(const.north west)+(-5.3,0.7)$) rectangle ($(samp2.south east)+(0.3,-0.6)$); + + \node (rank) [process, below of=dec2, yshift=-1.5cm] {Select top-$k$ by $L_\theta(\sigma')$}; + \node (vlmc) [io2, right of=rank, xshift=3cm] {Markov chain}; + \node [below=0.01cm of rank, xshift=-6cm] {\Large\textbf{Reranking}}; + \node [above=0.1cm of rank, xshift=1cm] {}; + \draw[thick,dotted, rounded corners] ($(rank.north west)+(-5.3,0.8)$) rectangle ($(rank.south east)+(5.3,-0.9)$); + +% \node (out1) [io, below of=pro2a] {Output}; + \node (stop) [startstop, below of=rank, yshift=-0.6cm]; + +% \draw [arrow] (dec0) -- node[anchor=east] {no} (pro1); + +% \draw [->,thick] (-5, 1.3) -- (synt); +% \draw [->,thick] (5, 1.3) -- (code); + +% \draw [arrow] (start) -- (code); +% \draw [arrow] (start) -- (synt); + \draw [arrow] (code) -- (lnfa); + \draw [arrow] (synt) -- (pcfg); + \draw [arrow] (lnfa) -- (pro1); + \draw [arrow] (pcfg) -- (pro1); + +% \draw [arrow] (in1) -- (pro1); + \draw [arrow] (pro1) -- (dec1); + \draw [arrow] (dec1) -- node[anchor=south] {yes} (pro2b); + \draw [arrow] (dec1) -- node[anchor=east,yshift=0.3cm] {no} (const); + \draw [arrow] (const) -- (dec2); + \draw [arrow] (pro2b) -- (lnfa); + \draw [arrow] (dec2) -- node[anchor=south] {small} (samp1); + \draw [arrow] (dec2) -- node[anchor=south] {large} (samp2); + + \draw [arrow] (vlmc) -- (rank); + \draw [arrow] (samp1) |- ([shift={(0,1.3cm)}]rank.north)--(rank.north); + \draw [arrow] (samp2) |- ([shift={(0,1.3cm)}]rank.north)--(rank.north); +% \draw [arrow] (pro2a) -- (out1); + \draw [arrow] (rank) -- (stop); +% \draw [arrow] (dec2) -- node[anchor=east] {1} (stop); - Since $R$ is intractable in general, we want a procedure that approximates it for a representative sampling of natural context-free grammars and strings, i.e., real-world programming languages and source code snippets. 
-\end{frame} + \end{tikzpicture} + } + \end{center} + \end{wrapfigure} -\begin{frame}[fragile]{From CFL Reachability to Real World Program Repair} - To fix real code, we needed to overcome a variety of interesting challenges:\vspace{10pt} + Our syntax repair procedure can be described in three high-level steps. First, we generate a synthetic grammar $(G_\cap)$ representing the intersection between the syntax $(G)$ and Levenshtein ball around the source code $\big(\Delta(\err\sigma, d)\big)$. During repair extraction, we retrieve as many repairs as possible from the intersection grammar via sampling or enumeration. Finally, we rank all repairs discovered by likelihood. - \begin{itemize} - \item \textbf{Syntax mismatch}: The syntax of real-world programming languages does not exactly correspond to the theory of formal languages. - \item \textbf{Source code $\approx$ PL}: Most of the time, source code in the wild is incomplete or only loosely approximates a programming language. - \item \textbf{Responsiveness}: The usefulness of synthetic repairs is inversely proportional to the amount of time required to generate them. - \item \textbf{Edit generation}: How do we generate edits that are (1) syntactically admissible (2) statistically plausible and (3) semantically meaningful? - \item \textbf{Evaluation}: Big code and version control is too coarse-grained, contains irrelevant edits, not representative of small errors/fixes. - \end{itemize} \end{frame} -\begin{frame}[fragile]{Design Criteria} - \begin{itemize} - \item First and foremost, the framework must be \textbf{sound} and \textbf{complete}. It must (1) accept arbitrary CFGs, and (2) recognize all and only syntactically valid strings within a fixed edit distance of a given string. - \item Second, we strongly prefer repairs to be \textbf{plausible} and \textbf{consistent}, i.e., likely to be expressed in practice and consistent with the author's intent and surrounding context (e.g., stylistically, semantically) - \item Third, recommendations must be \textbf{efficient} and \textbf{responsive}. We must be able to recognize valid strings in subcubic time, and generate repairs in (at worst) polynomial time. These criteria are necessary to provide real-time feedback whilst the user is typing. - \item Fourth, the framework must be \textbf{robust} and \textbf{scalable}. Ceteris paribus, the framework should be robust to multiple errors, handle large grammars and scale linearly with additional processors. - \end{itemize} +\begin{frame}[fragile]{Characteristics of the repair dataset} + \begin{figure}[h!] 
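% Editor's note: a minimal sketch of the control flow in the flowchart above
% (empty intersection -> widen the radius; small language -> enumerate; large ->
% sample; finally rerank). The stage implementations are passed in as callables
% because the real ones (Bar-Hillel intersection, samplers) do not fit on a slide;
% every name below is an illustrative placeholder, not Tidyparse's actual API.
%
%   def repair_pipeline(broken, intersect, enumerate_lang, sample_lang, score,
%                       d=1, k=10, small=10**5):
%       g_cap = intersect(broken, d)               # grammar for L(G) intersect Lev(broken, d)
%       while g_cap is None:                       # empty intersection:
%           d += 1                                 #   widen the Levenshtein ball and retry
%           g_cap = intersect(broken, d)
%       size, language = g_cap                     # (cardinality, generator of strings)
%       candidates = enumerate_lang(language) if size <= small else sample_lang(language)
%       return sorted(candidates, key=score)[:k]   # rerank, keep the top-k repairs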
+ \begin{tikzpicture}[scale=0.52] + \begin{axis}[ + ybar, + bar width=5pt, + xlabel={Length $(|\err\sigma|)$}, + ylabel={Frequency}, + title={Cumulative length distribution}, + axis x line*=bottom, + axis y line*=left, + ymin=0, + ymax=65, + xtick=data, + xticklabels={,,<30,,,<60,,,<90,}, + ymajorgrids=true, + grid style=dashed, + width=0.45\textwidth, + height=0.3\textwidth + ] + + \addplot[fill=black!30] table { + X Y + 1 7.60 + 2 14.52 + 3 22.01 + 4 30.54 + 5 37.82 + 6 44.30 + 7 49.68 + 8 55.21 + 9 59.75 + 10 63.59 + }; + \draw[red, dashed] (axis cs:8.5,0) -- (axis cs:8.5,65); + \end{axis} + \end{tikzpicture} + \begin{tikzpicture}[scale=0.52] + \begin{axis}[ + ybar, + bar width=5pt, + title={Human repair distance}, + xlabel={Edits, $\Delta(\err\sigma, \sigma')$}, + ylabel={Frequency}, + axis x line*=bottom, + axis y line*=left, + xtick=data, + ymajorgrids=true, + grid style=dashed, + xticklabels={,\leq 2,,\leq 4,,\leq 6,,\leq 8,,\leq 10}, + ytick={0, 20, 40, 60, 80, 100}, + ymin=0, + width=0.45\textwidth, + height=0.3\textwidth + ] + \addplot[fill=black!30] table { + X Y + 1 31.48 + 2 47.52 + 3 54.89 + 4 60.44 + 5 63.88 + 6 66.38 + 7 68.02 + 8 70.04 + 9 71.49 + 10 72.22 + }; + \draw[red, dashed] (axis cs:4.5,0) -- (axis cs:4.5,80); + \end{axis} + \end{tikzpicture} + \begin{tikzpicture}[scale=0.52] + \begin{axis}[ + ybar, + bar width=5pt, + xlabel={Beginning $\longleftrightarrow$ End}, + ylabel={Frequency}, + title={Normalized edit locations}, + axis x line*=bottom, + axis y line*=left, + ymin=0, + ymax=35, + xtick=data, + xticklabels={0\%,,,,,,,,,100\%}, + ymajorgrids=true, + grid style=dashed, + width=0.45\textwidth, + height=0.3\textwidth + ] + + \addplot[fill=black!30] table { + X Y + 10 11.6539 + 20 5.7252 + 30 6.2087 + 40 5.9542 + 50 5.5980 + 60 7.9389 + 70 7.0738 + 80 6.9466 + 90 12.4173 + 100 30.4835 + }; + \end{axis} + \end{tikzpicture} +% \begin{tikzpicture} +% \begin{axis}[ +% ybar, +% bar width=5pt, +% title={Intra-patch edit distance}, +% xlabel={Caret distance}, +% ylabel={Frequency}, +% axis x line*=bottom, +% axis y line*=left, +% xtick=data, +% ymajorgrids=true, +% grid style=dashed, +% xticklabels={1,2,3,4,5,6,7,8,9,10+}, +% width=0.45\textwidth, +% height=0.3\textwidth +% ] +% +% \addplot table { +% X Y +% 1 40.66 +% 2 15.00 +% 3 5.80 +% 4 4.86 +% 5 4.26 +% 6 2.98 +% 7 2.05 +% 8 2.73 +% 9 1.62 +% 10 13.64 +% }; +% \end{axis} +% \end{tikzpicture} + \begin{tikzpicture}[scale=0.52] + \begin{axis}[ + legend cell align={left}, + legend style={fill opacity=1, draw opacity=1, text opacity=1, draw=lightgray204, legend columns=-1, legend pos=north east, legend style={at={(0.5,1.8)},anchor=north}}, + xlabel={$|\err\sigma|$}, + ylabel={Stable region}, + title={Stability profile}, + ybar, + axis lines*=left, + xtick={0, 10, 20, 30, 40, 50, 60, 70}, + ytick={0, 0.2, 0.4, 0.6, 0.8, 1.0}, + xticklabels={, {[}10{,}20{)}, , {[}30{,}40{)}, , {[}50{,}60{)}, , {[}70{,}80{)}}, + yticklabels={0, 0.2, 0.4, 0.6, 0.8, 1.0}, + x tick label style={font=\scriptsize}, + ymax=1.0, + ymin=0.0, + bar width=3pt, + grid style=dashed, + ymajorgrids=true, + width=0.45\textwidth, + height=0.3\textwidth + ] + \addlegendimage{empty legend} + \addlegendentry{$\Delta(\err\sigma, \sigma')=$} + \addlegendimage{ybar,ybar legend,draw=none,green,fill=green!50} + \addlegendentry{1,} + \addlegendimage{ybar,ybar legend,draw=none,blue,fill=blue!50} + \addlegendentry{2,} + \addlegendimage{ybar,ybar legend,draw=none,orange,fill=orange!50} + \addlegendentry{3} + \addplot[green, fill=green!50] coordinates {(0, 0.80) (10, 
0.91) (20, 0.96) (30, 0.97) (40, 0.99) (50, 0.99) (60, 0.99) (70, 0.99)}; + \addplot[blue, fill=blue!50] coordinates {(0, 0.35) (10, 0.59) (20, 0.69) (30, 0.73) (40, 0.79) (50, 0.82) (60, 0.84) (70, 0.86)}; + \addplot[orange, fill=orange!50] coordinates {(0, 0.23) (10, 0.45) (20, 0.58) (30, 0.66) (40, 0.70) (50, 0.77) (60, 0.78) (70, 0.86)}; + \end{axis} + \end{tikzpicture} + \caption{Repair statistics across the StackOverflow dataset, of which Tidyparse can handle about half in under $\sim$30s and $\sim$150 GB. Larger repairs and edit distances are possible, albeit requiring additional time and memory. The stability profile measures the average fraction of all edit locations that were never altered by any repair in the $L\big(\err\sigma, \Delta(\err\sigma, \sigma')\big)$-ball across repairs of varying length and distance.}\label{fig:patch_stats} + \end{figure} \end{frame} -\begin{frame}[fragile]{High-level architecture overview} - \begin{center} - \includegraphics[width=0.9\linewidth]{../figures/architecture_overview.pdf} - \end{center} - \vspace{-10pt} - \begin{footnotesize} - Our framework consists of three components, (2) a solver, (4) ranker and (3) sampler. Given an invalid string and grammar (1), we first compile them into a multilinear system of equations which can be solved directly, yielding a set of repairs that are ranked using a suitable scoring function (4). Optionally, we may introduce stochastic edits to the string using the Levenshtein ball sampler (3) and extract the solutions incrementally (5). - \end{footnotesize} +\begin{frame}[fragile]{Ranked repair} + We train on lexical n-grams using the standard MLE for Markov chains. To score the repairs, we use the conventional length-normalized NLL: + + \begin{equation} + \text{NLL}(\sigma) = -\frac{1}{|\sigma|}\sum_{i=1}^{|\sigma|}\log P_\theta(\sigma_i \mid \sigma_{i-1}\ldots\sigma_{i-n}) + \end{equation} + + For each retrieved set $\hat{A} \subseteq A$ drawn before a predetermined timeout and each $\sigma \in \hat{A}$, we score the repair and return $\hat{A}$ in ascending order. + + To evaluate the quality of our ranking, we use the Precision@k statistic. Specifically, given a repair model, $R: \Sigma^* \rightarrow 2^{\Sigma^*}$ and a parallel corpus, $\mathcal{D}_{\text{test}}$, of errors ($\sigma^\dagger$) and repairs ($\sigma'$), we define Precision@k as: + + \begin{equation} + \text{Precision@k}(R) = \frac{1}{|\mathcal{D}_{\text{test}}|}\sum_{\langle\sigma^\dagger, \sigma'\rangle \in \mathcal{D}_{\text{test}}} \mathds{1}\big[\sigma' \in \argmax_{\bm{\sigma} \subset R(\sigma^\dagger), |\bm{\sigma}| \leq k}\sum_{\sigma \in \bm{\sigma}}\text{NLL}(\sigma)\big] + \end{equation} \end{frame} -\begin{frame}[fragile]{Uniform sampling benchmark on natural syntax errors} - \begin{figure}[H] - \resizebox{.3\textwidth}{!}{\input{repair1-3_plot.tex}} - \resizebox{.3\textwidth}{!}{\input{repair1_plot.tex}} - \resizebox{.3\textwidth}{!}{\input{repair2_plot.tex}} +\begin{frame}[fragile]{Precision and latency comparison} + \begin{figure}[h!] + \resizebox{.24\textwidth}{!}{\input{../splash2024/len_dist_tidy}} + \resizebox{.24\textwidth}{!}{\input{../splash2024/len_dist_bifi_all}} + \resizebox{.24\textwidth}{!}{\input{../splash2024/len_dist_s2p}} + \resizebox{.24\textwidth}{!}{\input{../splash2024/len_dist_bifi}} + \caption{Tidyparse, Seq2Parse and BIFI repair precision across length and edits.} + \end{figure} + \begin{figure}[h!] 
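% Editor's note: a self-contained toy illustration of the length-normalized NLL
% used for reranking above. It fits a bigram model by MLE counts; the add-one
% smoothing is our own addition (purely to keep unseen bigrams finite), and the
% two-sentence corpus and candidate strings are hypothetical.
%
%   import math
%   from collections import Counter
%
%   def train_ngram(corpus, n=2):
%       grams, ctxs = Counter(), Counter()
%       for toks in corpus:
%           padded = ['<s>'] * (n - 1) + list(toks) + ['</s>']
%           for i in range(n - 1, len(padded)):
%               grams[tuple(padded[i - n + 1:i + 1])] += 1
%               ctxs[tuple(padded[i - n + 1:i])] += 1
%       vocab = {t for toks in corpus for t in toks} | {'</s>'}
%       return grams, ctxs, len(vocab)
%
%   def nll(toks, model, n=2):
%       grams, ctxs, v = model
%       padded = ['<s>'] * (n - 1) + list(toks) + ['</s>']
%       total = 0.0
%       for i in range(n - 1, len(padded)):
%           num = grams[tuple(padded[i - n + 1:i + 1])] + 1   # add-one smoothing
%           den = ctxs[tuple(padded[i - n + 1:i])] + v
%           total -= math.log(num / den)
%       return total / (len(padded) - (n - 1))                # length-normalized
%
%   model = train_ngram([['def', 'f', '(', ')', ':'], ['x', '=', 'f', '(', ')']])
%   candidates = ['def f ( ) :'.split(), 'def f ( ( :'.split()]
%   print(min(candidates, key=lambda s: nll(s, model)))       # the well-formed variant scores lower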
+% \resizebox{.19\textwidth}{!}{\input{bar_hillel_repair.tex}} + \resizebox{.24\textwidth}{!}{\input{../splash2024/bar_hillel_repair_1}} + \resizebox{.24\textwidth}{!}{\input{../splash2024/bar_hillel_repair_2}} + \resizebox{.24\textwidth}{!}{\input{../splash2024/bar_hillel_repair_3}} + \resizebox{.24\textwidth}{!}{\input{../splash2024/bar_hillel_repair_4}} +% \resizebox{.24\textwidth}{!}{\input{bar_hillel_repair_5}} %\resizebox{.3\textwidth}{!}{\input{repair1_plot.tex}} %\resizebox{.307\textwidth}{!}{\input{repair2_plot.tex}} - \caption{Repairs discovered before the latency cutoff are reranked based on their tokenwise perplexity and compared for an exact lexical match with the human repair at or below rank k. We note that the uniform sampling procedure is not intended to be used in practice, but provides a baseline for the empirical density of the admissible set, and an upper bound on the latency required to attain a given precision.}\label{fig:human} + \caption{Latency benchmarks. Note the varying y-axis ranges. The red line marks Seq2Parse and the orange line marks BIFI's Precision@1 on the same repairs.}\label{fig:human} \end{figure} \end{frame} -\begin{frame}[fragile]{Adaptive sampling benchmark on natural syntax errors} - \begin{figure}[H] - \resizebox{.24\textwidth}{!}{\input{repair1-3_10s_plot.tex}} - \resizebox{.25\textwidth}{!}{\input{repair1_10s_plot.tex}} - \resizebox{.24\textwidth}{!}{\input{repair2_10s_plot.tex}} - \resizebox{.24\textwidth}{!}{\input{repair3_10s_plot.tex}} - \caption{Adaptive sampling repairs. The red line indicates Seq2Parse precision@1 on the same dataset. Since it only supports generating one repair, we do not report precision@k or the intermediate latency cutoffs.}\label{fig:adaptive} +\begin{frame}[fragile]{Results from sample efficiency experiments} + \begin{figure}[h!] + \input{../splash2024/sample_efficiency}\\ +% \caption{Sample efficiency of Tidyparse at varying Levenshtein radii. 
After drawing up to $\sim10^5$ samples without replacement we can usually retrieve the human repair for almost all repairs fewer than four edits.}\label{fig:sample_efficiency} + \resizebox{.35\textwidth}{!}{\input{../splash2024/throughput}}\hspace{1cm} + \resizebox{.35\textwidth}{!}{\input{../splash2024/experiments/timings}} \end{figure} \end{frame} -\begin{frame}[fragile]{Precision on Error Correction in Synthetic Language} - \begin{figure}[H] -\begin{tikzpicture}[scale=0.41] -\begin{axis}[ -width=8.3cm, -height=7cm, -title={\hspace{-1cm}\textbf{(1a) Latency with known locations}}, -ybar, -bar width=6pt, -xlabel={Number of holes}, -ylabel={ms to synthesize 10 repairs}, -xtick=data, -axis x line*=bottom, -axis y line*=left, -ytick pos=left, -xticklabels from table={\loctimings}{holes}, -ymajorgrids, -legend pos=north west, -legend columns=2, -error bars/y dir=both, -error bars/y explicit -] -\addplot table [x expr=\coordindex, y=d1, y error=d1err]{\loctimings}; -\addplot table [x expr=\coordindex, y=d2, y error=d2err]{\loctimings}; -\addplot table [x expr=\coordindex, y=d3, y error=d3err]{\loctimings}; -\addplot table [x expr=\coordindex, y=d4, y error=d4err]{\loctimings}; -\legend{Dyck-1, Dyck-2, Dyck-3, Dyck-4} -\end{axis} -\end{tikzpicture} -\begin{tikzpicture}[scale=0.41] -\begin{axis}[ -width=8.3cm, -height=7cm, -title={\hspace{-1cm}\textbf{(1b) Latency with unknown locations}}, -ybar, -bar width=20pt, -xlabel={Number of errors}, -%ylabel={ms to synthesize 1 repair}, -xtick=data, -axis x line*=bottom, -axis y line*=left, -enlarge x limits={abs=0.5}, -ymode=log, -ytick pos=left, -xticklabels from table={\unloctimings}{errors}, -ymajorgrids, -legend pos=north west, -error bars/y dir=both, -error bars/y explicit -] -\addplot table [x expr=\coordindex, y=d1]{\unloctimings}; -\addplot table [x expr=\coordindex, y=d2]{\unloctimings}; -\addplot table [x expr=\coordindex, y=d3]{\unloctimings}; -\legend{Dyck-1, Dyck-2, Dyck-3} -\end{axis} -\end{tikzpicture}\\ -\begin{tikzpicture}[scale=0.41] - \begin{axis}[ - width=8.3cm, - height=7cm, -% title={\hspace{-1cm}\textbf{Java Brackets}}, - ybar, - bar width=10pt, - xlabel={$|\Sigma^*|$}, - ylabel={Top-1 parser acceptance}, - title={\textbf{(2a) Synthetic Java bracket error correction}}, - xtick=data, - axis x line*=bottom, - axis y line*=left, - enlarge x limits={abs=0.5}, - ytick pos=left, - xticklabels from table={\syntheticerrors}{len}, - ymajorgrids, - legend pos=north east, - legend columns=3, - error bars/y dir=both, - error bars/y explicit - ] - \addplot table [x expr=\coordindex, y=10s]{\syntheticerrors}; - \addplot table [x expr=\coordindex, y=30s]{\syntheticerrors}; - \addplot table [x expr=\coordindex, y=60s]{\syntheticerrors}; - \legend{10s, 30s, 60s} - \end{axis} -\end{tikzpicture} -\begin{tikzpicture}[scale=0.41] - \begin{axis}[ - width=8.3cm, - height=7cm, - title={\textbf{(2b) Organic Python bracket error correction}}, - ybar, - bar width=10pt, - xlabel={$|\Sigma^*|$}, -%ylabel={Parser acceptance}, - xtick=data, - axis x line*=bottom, - axis y line*=left, - enlarge x limits={abs=0.5}, - ytick pos=left, - xticklabels from table={\naturalerrors}{len}, - ymajorgrids, - y tick label style={/pgf/number format/.cd,% - scaled y ticks = false, - set thousands separator={}, - fixed}, - legend pos=north east, - legend columns=3, - error bars/y dir=both, - error bars/y explicit - ] - \addplot table [x expr=\coordindex, y=10s]{\naturalerrors}; - \addplot table [x expr=\coordindex, y=30s]{\naturalerrors}; - \addplot table [x expr=\coordindex, 
y=60s]{\naturalerrors}; - \legend{10s, 30s, 60s} - \end{axis} -\end{tikzpicture} - \caption{Benchmarking bracket correction latency and accuracy across two bracketing langauges, one generated from Dyck-n, and the second uses an abstracted source code snippet with imbalanced parentheses.} -\end{figure} -\end{frame} - -\begin{frame}[fragile]{Uniform sampling benchmark} +\begin{frame}[fragile]{Outcomes in the syntax repair pipeline} + \vspace{-1cm} \begin{figure} - \resizebox{.5\textwidth}{!}{\input{throughput.tex}} - \caption{We evaluate throughput by sampling edits across invalid strings $|\err\sigma| \leq 40$ from the StackOverflow dataset of varying length, and measure the total number of syntactically valid edits discovered, as a function of string length and language edit distance $\Delta\in[1, 3]$. Each trial is terminated after 10 seconds, and the experiment is repeated across 7.3k total repairs.} + \resizebox{.83\textwidth}{!}{\input{../splash2024/sankey}} + \vspace{-1cm} + \caption{Sankey diagram of 967 total repair instances sampled uniformly from the StackOverflow Python dataset balanced acrost snippet lengths and edit distances ($\lfloor|\err\sigma| / 10\rfloor \in [0, 8], \Delta(\err\sigma, \sigma') < 4$) with a sampling timeout of 30s per repair.} \end{figure} \end{frame} -\begin{frame}[fragile]{Multicore Scaling Results (aarch64)} - \begin{tikzpicture} - \begin{axis}[ - ybar, - enlargelimits=0.15, - legend style={at={(0.03,0.97)},anchor=north west}, - title={\textbf{Relative Total Distinct Solutions Found vs. Single Core}}, - x=1cm, - ylabel={Relative improvement}, - xlabel={Number of assigned cores}, - symbolic x coords={2,3,4,5,6,7,8,9,10}, - xtick=data, - bar width=3pt, - ] - \addplot coordinates { - (2,0.1343) (3,0.1955) (4,0.2249) (5,0.2475) - (6,0.2760) (7,0.2994) (8,0.3073) (9,0.3151) (10,0.3229) - }; - \addplot coordinates { - (2,0.2655) (3,0.4353) (4,0.5614) (5,0.6644) - (6,0.7462) (7,0.7783) (8,0.8347) (9,0.8005) (10,0.7798) - }; - \addplot coordinates { - (2,0.3972) (3,0.6928) (4,0.9327) (5,1.1834) - (6,1.3138) (7,1.3988) (8,1.6039) (9,1.5500) (10,1.5691) - }; - \addplot coordinates { - (2,0.4863) (3,0.8326) (4,1.1368) (5,1.3879) - (6,1.5873) (7,1.7494) (8,1.8802) (9,1.9059) (10,1.9625) - }; - \addplot coordinates { - (2,0.4315) (3,0.7583) (4,1.0122) (5,1.2593) - (6,1.4586) (7,1.6349) (8,1.7813) (9,1.8324) (10,1.8695) - }; - \legend{Holes=2,Holes=3,Holes=4,Holes=5,Holes=6} - \end{axis} - \end{tikzpicture} -\end{frame} - \section{Formal Language Theory}\label{sec:fltheory} %------------------------------------------------------------------------------------------------ @@ -744,6 +939,202 @@ \section{Formal Language Theory}\label{sec:fltheory} \end{center} \end{frame} +\begin{frame}[fragile]{Levenshtein reachability} + \begin{figure}[H] + \resizebox{\textwidth}{!}{ + \begin{tikzpicture}[ +%->, % makes the edges directed + >=stealth', + node distance=2.5cm, % specifies the minimum distance between two nodes. Change if necessary. 
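% Editor's note: a runnable reference simulation of the Levenshtein NFA drawn in
% this figure (and of the nominal rules on the following slide): a state (i, e)
% records "matched i tokens of sigma with e errors"; up-arrows are insertions,
% diagonals substitutions, right-arrows exact matches, and the generalized knight
% moves skip d deleted tokens before a match. The nominal [!= sigma_i] guards only
% prune redundant paths and are omitted here. This is our reference check, not
% Tidyparse's construction; it works over characters purely for brevity.
%
%   def lev_nfa_accepts(sigma, query, k):
%       states = {(0, 0)}
%       for c in query:
%           nxt = set()
%           for i, e in states:
%               if e < k:
%                   nxt.add((i, e + 1))                      # insertion: c is spurious
%                   if i < len(sigma):
%                       nxt.add((i + 1, e + 1))              # substitution
%               if i < len(sigma) and sigma[i] == c:
%                   nxt.add((i + 1, e))                      # exact match
%               for d in range(1, k - e + 1):                # knight: delete d tokens, then match
%                   if i + d < len(sigma) and sigma[i + d] == c:
%                       nxt.add((i + d + 1, e + d))
%           states = nxt
%       return any(e + (len(sigma) - i) <= k for i, e in states)  # delete any leftover suffix
%
%   assert lev_nfa_accepts("kitten", "sitting", 3) and not lev_nfa_accepts("kitten", "sitting", 2)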
+% every state/.style={thick, fill=gray!10}, % sets the properties for each ’state’ node + initial text=$ $, % sets the text that appears on the start arrow + ] + \node[state, initial] (00) {$q_{0,0}$}; + \node[state, right of=00] (10) {$q_{1,0}$}; + \node[accepting, state, right of=10] (20) {$q_{2,0}$}; + \node[accepting, state, right of=20] (30) {$q_{3,0}$}; + \node[accepting, state, right of=30] (40) {$q_{4,0}$}; + \node[accepting, state, right of=40] (50) {$q_{5,0}$}; + + \node[state, above of=00, shift={(-2cm,0cm)}] (01) {$q_{0,1}$}; + \node[state, right of=01] (11) {$q_{1,1}$}; + \node[state, right of=11] (21) {$q_{2,1}$}; + \node[accepting, state, right of=21] (31) {$q_{3,1}$}; + \node[accepting, state, right of=31] (41) {$q_{4,1}$}; + \node[accepting, state, right of=41] (51) {$q_{5,1}$}; + + \node[state, above of=01, shift={(-2cm,0cm)}] (0j) {$q_{0,2}$}; + \node[state, right of=0j] (1j) {$q_{1,2}$}; + \node[state, right of=1j] (2j) {$q_{2,2}$}; + \node[state, right of=2j] (3j) {$q_{3,2}$}; + \node[accepting, state, right of=3j] (4j) {$q_{4,2}$}; + \node[accepting, state, right of=4j] (5j) {$q_{5,2}$}; + + \node[state, above of=0j, shift={(-2cm,0cm)}] (0k) {$q_{0,3}$}; + \node[state, right of=0k] (1k) {$q_{1,3}$}; + \node[state, right of=1k] (2k) {$q_{2,3}$}; + \node[state, right of=2k] (3k) {$q_{3,3}$}; + \node[state, right of=3k] (4k) {$q_{4,3}$}; + \node[accepting, state, right of=4k] (5k) {$q_{5,3}$}; + + \draw [->] (00) edge[below] node{$\sigma_1$} (10); + \draw [->] (10) edge[below] node{$\sigma_2$} (20); + \draw [->] (20) edge[below] node{$\sigma_3$} (30); + \draw [->] (30) edge[below] node{$\sigma_4$} (40); + \draw [->] (40) edge[below] node{$\sigma_5$} (50); + + \draw [->] (01) edge[below] node{$\sigma_1$} (11); + \draw [->] (11) edge[below] node[shift={(-0.2cm,0cm)}]{$\sigma_2$} (21); + \draw [->] (21) edge[below] node[shift={(-0.2cm,0cm)}]{$\sigma_3$} (31); + \draw [->] (31) edge[below] node[shift={(-0.2cm,0cm)}]{$\sigma_4$} (41); + \draw [->] (41) edge[below] node{$\sigma_5$} (51); + + \draw [->] (0j) edge[below] node{$\sigma_1$} (1j); + \draw [->] (1j) edge[below] node{$\sigma_2$} (2j); + \draw [->] (2j) edge[below] node{$\sigma_3$} (3j); + \draw [->] (3j) edge[below] node{$\sigma_4$} (4j); + \draw [->] (4j) edge[below] node{$\sigma_5$} (5j); + + \draw [->] (0k) edge[below] node{$\sigma_1$} (1k); + \draw [->] (1k) edge[below] node{$\sigma_2$} (2k); + \draw [->] (2k) edge[below] node{$\sigma_3$} (3k); + \draw [->] (3k) edge[below] node{$\sigma_4$} (4k); + \draw [->] (4k) edge[below] node{$\sigma_5$} (5k); + + \draw [->] (00) edge[left] node{$\phantom{\cdot}$} (11); + \draw [->] (10) edge[left] node{$\phantom{\cdot}$} (21); + \draw [->] (20) edge[left] node{$\phantom{\cdot}$} (31); + \draw [->] (30) edge[left] node{$\phantom{\cdot}$} (41); + \draw [->] (40) edge[left] node{$\phantom{\cdot}$} (51); + +% Super-knight arcs + \draw [->, red] (00) edge[bend right=8] node[east, shift={(-0.2cm,-0.7cm)}]{$\color{red}\sigma_3$} (3j); + \draw [->, red] (10) edge[bend right=8] node[east, shift={(-0.2cm,-0.7cm)}]{$\color{red}\sigma_4$} (4j); + \draw [->, red] (20) edge[bend right=8] node[east, shift={(-0.2cm,-0.7cm)}]{$\color{red}\sigma_5$} (5j); + + \draw [->, red] (01) edge[bend left=8] node[east, shift={(-0.2cm,-0.7cm)}]{$\color{red}\sigma_3$} (3k); + \draw [->, red] (11) edge[bend left=8] node[east, shift={(-0.2cm,-0.7cm)}]{$\color{red}\sigma_4$} (4k); + \draw [->, red] (21) edge[bend left=8] node[east, shift={(-0.2cm,-0.7cm)}]{$\color{red}\sigma_5$} (5k); + + \draw [->, violet] 
(00) edge node[east, shift={(-0.1cm,-0.8cm)}]{$\color{violet}\sigma_4$} (4k); + \draw [->, violet] (10) edge node[east, shift={(-0.1cm,-0.8cm)}]{$\color{violet}\sigma_5$} (5k); + + \draw [->] (01) edge[left] node{$\phantom{\cdot}$} (1j); + \draw [->] (11) edge[left] node{$\phantom{\cdot}$} (2j); + \draw [->] (21) edge[left] node{$\phantom{\cdot}$} (3j); + \draw [->] (31) edge[left] node{$\phantom{\cdot}$} (4j); + \draw [->] (41) edge[left] node{$\phantom{\cdot}$} (5j); + + \draw [->] (0j) edge[left] node{$\phantom{\cdot}$} (1k); + \draw [->] (1j) edge[left] node{$\phantom{\cdot}$} (2k); + \draw [->] (2j) edge[left] node{$\phantom{\cdot}$} (3k); + \draw [->] (3j) edge[left] node{$\phantom{\cdot}$} (4k); + \draw [->] (4j) edge[left] node{$\phantom{\cdot}$} (5k); + + \draw [->] (00) edge[bend left=10, left] node{$\phantom{\cdot}$} (01); + \draw [->] (10) edge[bend left=10, left] node{$\phantom{\cdot}$} (11); + \draw [->] (20) edge[bend left=10, left] node{$\phantom{\cdot}$} (21); + \draw [->] (30) edge[bend left=10, left] node{$\phantom{\cdot}$} (31); + \draw [->] (40) edge[bend left=10, left] node{$\phantom{\cdot}$} (41); + \draw [->] (50) edge[bend left=10, left] node{$\phantom{\cdot}$} (51); + + \draw [->] (01) edge[bend left=10, left] node{$\phantom{\cdot}$} (0j); + \draw [->] (11) edge[bend left=10, left] node{$\phantom{\cdot}$} (1j); + \draw [->] (21) edge[bend left=10, left] node{$\phantom{\cdot}$} (2j); + \draw [->] (31) edge[bend left=10, left] node{$\phantom{\cdot}$} (3j); + \draw [->] (41) edge[bend left=10, left] node{$\phantom{\cdot}$} (4j); + \draw [->] (51) edge[bend left=10, left] node{$\phantom{\cdot}$} (5j); + + \draw [->] (0j) edge[bend left=10, left] node{$\phantom{\cdot}$} (0k); + \draw [->] (1j) edge[bend left=10, left] node{$\phantom{\cdot}$} (1k); + \draw [->] (2j) edge[bend left=10, left] node{$\phantom{\cdot}$} (2k); + \draw [->] (3j) edge[bend left=10, left] node{$\phantom{\cdot}$} (3k); + \draw [->] (4j) edge[bend left=10, left] node{$\phantom{\cdot}$} (4k); + \draw [->] (5j) edge[bend left=10, left] node{$\phantom{\cdot}$} (5k); + + \draw [->, blue] (00) edge[bend right=11,below] node[shift={(0.5cm,0.3cm)}]{$\color{blue}\sigma_2$} (21); + \draw [->, blue] (10) edge[bend right=11,below] node[shift={(0.5cm,0.3cm)}]{$\color{blue}\sigma_3$} (31); + \draw [->, blue] (20) edge[bend right=11,below] node[shift={(0.5cm,0.3cm)}]{$\color{blue}\sigma_4$} (41); + \draw [->, blue] (30) edge[bend right=11,below] node[shift={(0.5cm,0.3cm)}]{$\color{blue}\sigma_5$} (51); + + \draw [->, blue] (01) edge[bend right=3,below] node[shift={(0.3cm,0.2cm)}]{$\color{blue}\sigma_2$} (2j); + \draw [->, blue] (11) edge[bend right=3,below] node[shift={(0.3cm,0.2cm)}]{$\color{blue}\sigma_3$} (3j); + \draw [->, blue] (21) edge[bend right=3,below] node[shift={(0.3cm,0.2cm)}]{$\color{blue}\sigma_4$} (4j); + \draw [->, blue] (31) edge[bend right=3,below] node[shift={(0.3cm,0.2cm)}]{$\color{blue}\sigma_4$} (5j); + + \draw [->, blue] (0j) edge[bend left=8,below] node[shift={(-0.45cm,-0.55cm)}]{$\color{blue}\sigma_2$} (2k); + \draw [->, blue] (1j) edge[bend left=8,below] node[shift={(-0.45cm,-0.55cm)}]{$\color{blue}\sigma_3$} (3k); + \draw [->, blue] (2j) edge[bend left=8,below] node[shift={(-0.45cm,-0.55cm)}]{$\color{blue}\sigma_4$} (4k); + \draw [->, blue] (3j) edge[bend left=8,below] node[shift={(-0.45cm,-0.55cm)}]{$\color{blue}\sigma_5$} (5k); + +%https://tex.stackexchange.com/a/20986/139648 + \draw [decorate,decoration={brace,amplitude=10pt,raise=10pt,mirror}] (00.south west) -- (50.south east) 
node[midway,yshift=-3em]{\textbf{String length}}; + \draw [decorate,decoration={brace,amplitude=10pt,raise=20pt}] (00.south west) -- (0k.north west) node[midway,xshift=-1cm,yshift=-1cm,rotate=-54]{\textbf{Edit distance}}; + \end{tikzpicture} + } + \caption{Bounded Levenshtein reachability from $\sigma: \Sigma^n$ is expressible as an NFA populated by accept states within radius $k$ of $S=q_{n,0}$, which accepts all strings $\sigma'$ within Levenshtein radius $k$ of $\sigma$.} + \end{figure} +\end{frame} + +\begin{frame}[fragile]{The nominal Levenshtein automaton} + The original Levenshtein automaton (Schulz \& Stoyan, 2002): + + \begin{prooftree} + \AxiomC{$s\in\Sigma \phantom{\land} i \in [0, n] \phantom{\land} j \in [1, k]$} + \RightLabel{$\duparrow$} + \UnaryInfC{$(q_{i, j-1} \overset{s}{\rightarrow} q_{i,j}) \in \delta$} + \DisplayProof + \hskip 1.5em + \AxiomC{$s\in\Sigma \phantom{\land} i \in [1, n] \phantom{\land} j \in [1, k]$} + \RightLabel{$\ddiagarrow$} + \UnaryInfC{$(q_{i-1, j-1} \overset{s}{\rightarrow} q_{i,j}) \in \delta$} + \end{prooftree} + \begin{prooftree} + \AxiomC{$s=\sigma_i \phantom{\land} i \in [1, n] \phantom{\land} j \in [0, k]$} + \RightLabel{$\drightarrow$} + \UnaryInfC{$(q_{i-1, j} \overset{s}{\rightarrow} q_{i,j}) \in \delta$} + \DisplayProof + \hskip 1.5em + \AxiomC{$s=\sigma_i \phantom{\land} i \in [2, n] \phantom{\land} j \in [1, k]$} + \RightLabel{$\knightarrow$} + \UnaryInfC{$(q_{i-2, j-1} \overset{s}{\rightarrow} q_{i,j}) \in \delta$} + \end{prooftree} + \begin{prooftree} + \AxiomC{$\vphantom{|}$} + \RightLabel{$\textsc{Init}$} + \UnaryInfC{$q_{0,0} \in I$} + \DisplayProof + \hskip 1.5em + \AxiomC{$q_{i, j}$} + \AxiomC{$|n-i+j| \leq k$} + \RightLabel{$\textsc{Done}$} + \BinaryInfC{$q_{i, j}\in F$} + \end{prooftree} + + We modify the original automaton with a nominal predicate: + + \begin{prooftree} + \AxiomC{$i \in [0, n] \phantom{\land} j \in [1, k]$} + \RightLabel{$\duparrow$} + \UnaryInfC{$(q_{i, j-1} \overset{{\color{orange}[\neq \sigma_i]}}{\rightarrow} q_{i,j}) \in \delta$} + \DisplayProof + \hskip 1.5em + \AxiomC{$i \in [1, n] \phantom{\land} j \in [1, k]$} + \RightLabel{$\ddiagarrow$} + \UnaryInfC{$(q_{i-1, j-1} \overset{{\color{orange}[\neq \sigma_i]}}{\rightarrow} q_{i,j}) \in \delta$} + \end{prooftree} + \begin{prooftree} + \AxiomC{$i \in [1, n] \phantom{\land} j \in [0, k]$} + \RightLabel{$\drightarrow$} + \UnaryInfC{$(q_{i-1, j} \overset{{\color{orange}[=\sigma_i]}}{\rightarrow} q_{i,j}) \in \delta$} + \DisplayProof + \hskip 1.5em + \AxiomC{$d \in [1, d_{\max}] \phantom{\land} i \in [d + 1, n] \phantom{\land} j \in [d, k]$} + \RightLabel{$\knightarrow$} + \UnaryInfC{$(q_{i-d-1, j-d} \overset{{\color{orange}[=\sigma_i]}}{\rightarrow} q_{i,j}) \in \delta$} + \end{prooftree} +\end{frame} + \begin{frame}{Background: Context-free grammars} In a context-free grammar $\mathcal{G} = \langle V, \Sigma, P, S\rangle$ all productions are of the form $P: V\times (V \cup \Sigma)^+$, i.e., RHS may contain any number of nonterminals, $V$. Recognition decidable in $n^\omega$, n.b. CFLs are \textbf{not} closed under intersection!\newline\\ % @@ -792,77 +1183,116 @@ \section{Formal Language Theory}\label{sec:fltheory} \end{center} \end{frame} -\begin{frame}{Background: Conjunctive grammars} - Conjunctive grammars naturally extend CFGs with CFL union and intersection, respecting closure under those operations. Equivalent to trellis automata, which are like contractive elementary cellular automata. 
Language inclusion is decidable in P.\\ - - \begin{prooftree} - \AxiomC{$\Gamma \vdash \mathcal{G}_1, \mathcal{G}_2 : \texttt{CG}$} - \RightLabel{$\cap$} - \UnaryInfC{$\Gamma \vdash \exists \:\mathcal{G}_3: \texttt{CG}\:.\:\mathcal{L}_{\mathcal{G}_1} \cap \mathcal{L}_{\mathcal{G}_1} \leftrightarrow \mathcal{G}_3$} - \end{prooftree} - - \begin{center} - \scalebox{0.8}{ - \begin{tikzpicture}[font=\sffamily,breathe dist/.initial=4ex] - \foreach \X [count=\Y,remember=\Y as \LastY] in - {finite,regular,context-free,conjunctive} - {\ifnum\Y=1 - \node[ellipse,draw,outer sep=0pt] (F-\Y) {\X}; - \else - \path[decoration={text along path, - text={|\sffamily|\X},text align=center,raise=0.9ex},decorate] - let \p1=($(F-\LastY.north)-(F-\LastY.west)$) - in (F-\LastY.west) arc(180:0:\x1 and \y1); - \path let \p1=($([yshift=\pgfkeysvalueof{/tikz/breathe dist}]F-\LastY.north) - -(F-\LastY.south)$), - \p2=($(F-1.east)-(F-1.west)$),\p3=($(F-1.north)-(F-1.south)$) - in ($([yshift=\pgfkeysvalueof{/tikz/breathe dist}]F-\LastY.north)!0.5!(F-\LastY.south)$) - node[minimum height=\y1,minimum width={\y1*\x2/\y3}, - draw,ellipse,inner sep=0pt, fill=black!30!white] (F-\Y){}; - \fi - } - \foreach \X [count=\Y,remember=\Y as \LastY] in - {finite,regular,context-free,conjunctive,context sensitive} - {\ifnum\Y=1 - \node[ellipse,draw,outer sep=0pt] (F-\Y) {\X}; - \else - \path[decoration={text along path, - text={|\sffamily|\X},text align=center,raise=0.9ex},decorate] - let \p1=($(F-\LastY.north)-(F-\LastY.west)$) - in (F-\LastY.west) arc(180:0:\x1 and \y1); - \path let \p1=($([yshift=\pgfkeysvalueof{/tikz/breathe dist}]F-\LastY.north) - -(F-\LastY.south)$), - \p2=($(F-1.east)-(F-1.west)$),\p3=($(F-1.north)-(F-1.south)$) - in ($([yshift=\pgfkeysvalueof{/tikz/breathe dist}]F-\LastY.north)!0.5!(F-\LastY.south)$) - node[minimum height=\y1,minimum width={\y1*\x2/\y3}, - draw,ellipse,inner sep=0pt] (F-\Y){}; - \fi} - \end{tikzpicture} - } - \end{center} -\end{frame} +%\begin{frame}{Background: Conjunctive grammars} +% Conjunctive grammars naturally extend CFGs with CFL union and intersection, respecting closure under those operations. Equivalent to trellis automata, which are like contractive elementary cellular automata. 
Language inclusion is decidable in P.\\ +% +% \begin{prooftree} +% \AxiomC{$\Gamma \vdash \mathcal{G}_1, \mathcal{G}_2 : \texttt{CG}$} +% \RightLabel{$\cap$} +% \UnaryInfC{$\Gamma \vdash \exists \:\mathcal{G}_3: \texttt{CG}\:.\:\mathcal{L}_{\mathcal{G}_1} \cap \mathcal{L}_{\mathcal{G}_1} \leftrightarrow \mathcal{G}_3$} +% \end{prooftree} +% +% \begin{center} +% \scalebox{0.8}{ +% \begin{tikzpicture}[font=\sffamily,breathe dist/.initial=4ex] +% \foreach \X [count=\Y,remember=\Y as \LastY] in +% {finite,regular,context-free,conjunctive} +% {\ifnum\Y=1 +% \node[ellipse,draw,outer sep=0pt] (F-\Y) {\X}; +% \else +% \path[decoration={text along path, +% text={|\sffamily|\X},text align=center,raise=0.9ex},decorate] +% let \p1=($(F-\LastY.north)-(F-\LastY.west)$) +% in (F-\LastY.west) arc(180:0:\x1 and \y1); +% \path let \p1=($([yshift=\pgfkeysvalueof{/tikz/breathe dist}]F-\LastY.north) +% -(F-\LastY.south)$), +% \p2=($(F-1.east)-(F-1.west)$),\p3=($(F-1.north)-(F-1.south)$) +% in ($([yshift=\pgfkeysvalueof{/tikz/breathe dist}]F-\LastY.north)!0.5!(F-\LastY.south)$) +% node[minimum height=\y1,minimum width={\y1*\x2/\y3}, +% draw,ellipse,inner sep=0pt, fill=black!30!white] (F-\Y){}; +% \fi +% } +% \foreach \X [count=\Y,remember=\Y as \LastY] in +% {finite,regular,context-free,conjunctive,context sensitive} +% {\ifnum\Y=1 +% \node[ellipse,draw,outer sep=0pt] (F-\Y) {\X}; +% \else +% \path[decoration={text along path, +% text={|\sffamily|\X},text align=center,raise=0.9ex},decorate] +% let \p1=($(F-\LastY.north)-(F-\LastY.west)$) +% in (F-\LastY.west) arc(180:0:\x1 and \y1); +% \path let \p1=($([yshift=\pgfkeysvalueof{/tikz/breathe dist}]F-\LastY.north) +% -(F-\LastY.south)$), +% \p2=($(F-1.east)-(F-1.west)$),\p3=($(F-1.north)-(F-1.south)$) +% in ($([yshift=\pgfkeysvalueof{/tikz/breathe dist}]F-\LastY.north)!0.5!(F-\LastY.south)$) +% node[minimum height=\y1,minimum width={\y1*\x2/\y3}, +% draw,ellipse,inner sep=0pt] (F-\Y){}; +% \fi} +% \end{tikzpicture} +% } +% \end{center} +%\end{frame} \begin{frame}[fragile]{Background: Closure properties of formal languages} Formal languages are not always closed under set-theoretic operations, e.g., CFL $\cap$ CFL is not CFL in general. Let $\cdot$ denote concatenation, $\star$ be Kleene star, and $\complement$ be complementation:\\ \begin{table} \begin{tabular}{c|ccccc} - & $\cup$ & $\cap$ & $\cdot$ & $\star$ & $\complement$ \\ - \hline - Finite$^1$ & \cmark & \cmark & \cmark & \cmark & \cmark \\ - Regular$^1$ & \cmark & \cmark & \cmark & \cmark & \cmark \\ - Context-free$^1$ & \cmark & \xmark & \cmark & \cmark & \xmark \\ - \rowcolor{slightgray} Conjunctive$^{1,2}$ & \cmark & \cmark & \cmark & \cmark & ? \\ - Context-sensitive$^2$ & \cmark & \cmark & \cmark & + & \cmark \\ - Recursively Enumerable$^2$ & \cmark & \cmark & \cmark & \cmark & \xmark \\ + & $\cup$ & $\cap$ & $\cdot$ & $\star$ & $\complement$ \\\hline + Finite$^1$ & \cmark & \cmark & \cmark & \cmark & \cmark \\ + Regular$^1$ & \cmark & \cmark & \cmark & \cmark & \cmark \\ + \rowcolor{slightgray} Context-free$^{1, 2}$ & \cmark & \xmark$^*$ & \cmark & \cmark & \xmark \\ + Conjunctive$^{1,2}$ & \cmark & \cmark & \cmark & \cmark & ? 
\\ + Context-sensitive$^2$ & \cmark & \cmark & \cmark & + & \cmark \\ + Recursively Enumerable$^2$ & \cmark & \cmark & \cmark & \cmark & \xmark \\ \end{tabular} \end{table} - We would like a language family that is (1) tractable, i.e., has polynomial recognition and search complexity and (2) reasonably expressive, i.e., can represent syntactic properties of real-world programming languages. + We would like a language family that is (1) tractable, i.e., has polynomial recognition and search complexity and (2) reasonably expressive, i.e., can represent syntactic properties of real-world programming languages.\vspace{0.2cm} + + $^*$ However, CFLs are closed under intersection with regular languages. +\end{frame} + +\begin{frame}[fragile]{The Bar-Hillel construction and its specialization} + The original Bar-Hillel construction provides a way to construct a grammar for the intersection of a regular and context-free language. + \noindent\begin{prooftree} + \AxiomC{$q \in I \phantom{\land} r \in F\vphantom{\overset{a}{\rightarrow}}$} + \RightLabel{$\downarrow$} + \UnaryInfC{$\big(S\rightarrow q S r\big) \in P_\cap$} + \DisplayProof + \hskip 2em + \AxiomC{$(A \rightarrow a) \in P$} + \AxiomC{$(q\overset{a}{\rightarrow}r) \in \delta$} + \RightLabel{$\uparrow$} + \BinaryInfC{$\big(qAr\rightarrow a\big)\in P_\cap$} + \DisplayProof + \AxiomC{$(w \rightarrow xz) \in P\vphantom{\overset{a}{\rightarrow}}$} + \AxiomC{$p,q,r \in Q$} + \RightLabel{$\Join$} + \BinaryInfC{$\big(pwr\rightarrow (pxq)(qzr)\big) \in P_\cap$} + \end{prooftree} + + We specialize the Bar-Hillel construction to nominal Levenshtein automata: + + \begin{prooftree} + \AxiomC{$(A \rightarrow a) \in P$} + \AxiomC{$(q\overset{{\color{orange}[\cdot]}}{\rightarrow}r) \in \delta$} + \AxiomC{$\color{orange}a[\cdot]$} + \RightLabel{$\uparrow$} + \TrinaryInfC{$\big(qAr\rightarrow a\big)\in P_\cap$} + \DisplayProof + \AxiomC{$\vphantom{\overset{[\cdot]}{\rightarrow}}\color{orange} w \triangleleft pr \phantom{\land} x \triangleleft pq \phantom{\land} z \triangleleft qr$} + \AxiomC{$(w \rightarrow xz) \in P\vphantom{\overset{a}{\rightarrow}}$} + \AxiomC{$p,q,r \in Q$} + \RightLabel{$\Join$} + \TrinaryInfC{$\big(pwr\rightarrow (pxq)(qzr)\big) \in P_\cap$} + \end{prooftree} + + Where $\triangleleft$ denotes compatibility between the Parikh map of a nonterminal and Levenshtein margin between two NFA states, see our paper for details. \end{frame} \section{Algebraic Parsing}\label{sec:algebraic-parsing} -\begin{frame}[fragile]{Context-free parsing, distilled} +\begin{frame}[fragile]{Parsing for linear algebraists} Given a CFG $\mathcal{G} \coloneqq \langle V, \Sigma, P, S\rangle$ in Chomsky Normal Form, we can construct a recognizer $R_\mathcal{G}: \Sigma^n \rightarrow \mathbb{B}$ for strings $\sigma: \Sigma^n$ as follows. 
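+% A minimal Python sketch of the plain Bar-Hillel product from the frame above
+% (the unspecialized rules; the orange Parikh-compatibility filters are left
+% out). It assumes the CFG is in CNF, given as unit productions (A, a) and
+% binary productions (w, x, z), and the NFA as (Q, I, F, delta) with delta a
+% set of (q, a, r) triples; product nonterminals are state-A-state triples.
+%
+%   def bar_hillel(units, binaries, Q, I, F, delta, S='S'):
+%       P_cap, S_cap = set(), 'S_cap'
+%       for q in I:                              # down rule: new start productions
+%           for r in F:
+%               P_cap.add((S_cap, ((q, S, r),)))
+%       for (A, a) in units:                     # up rule: terminal productions
+%           for (q, b, r) in delta:
+%               if a == b:
+%                   P_cap.add(((q, A, r), (a,)))
+%       for (w, x, z) in binaries:               # join rule: all O(|Q|^3) triples
+%           for p in Q:
+%               for q in Q:
+%                   for r in Q:
+%                       P_cap.add(((p, w, r), ((p, x, q), (q, z, r))))
+%       return S_cap, P_cap
+%
+% The specialization on the slide prunes the cubic join by admitting only those
+% triples (p, q, r) whose Levenshtein margins are compatible with the Parikh
+% images of w, x and z.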
Let $2^V$ be our domain, $0$ be $\varnothing$, $\oplus$ be $\cup$, and $\otimes$ be defined as follows: \vspace{-7pt} @@ -934,7 +1364,7 @@ \section{Algebraic Parsing}\label{sec:algebraic-parsing} \end{itemize} \end{frame} -\begin{frame}[fragile]{Satisfiability + holes (our contribution)} +\begin{frame}[fragile]{Satisfiability + holes} \begin{itemize} \item Can be lowered onto a Boolean tensor $\mathbb{B}_2^{n\times n \times |V|}$ (Valiant, 1975) \item Binarized CYK parser can be efficiently compiled to a SAT solver @@ -955,7 +1385,7 @@ \section{Algebraic Parsing}\label{sec:algebraic-parsing} \end{itemize} \end{frame} -\begin{frame}[fragile]{Satisfiability + holes (our contribution)} +\begin{frame}[fragile]{Satisfiability + holes} Let us consider an example with two holes, $\sigma = 1$ \hole\phantom{.}\hole, and the grammar being $G\coloneqq\{S\rightarrow N O N, O \rightarrow + \mid \times, N \rightarrow 0 \mid 1\}$. This can be rewritten into CNF as $G'\coloneqq \{S \rightarrow N L, N \rightarrow 0 \mid 1, O \rightarrow × \mid +, L \rightarrow O N\}$. Using the algebra where $\oplus=\cup$, $X \otimes Z = \big\{\;w \mid \langle x, z\rangle \in X \times Z, (w\rightarrow xz) \in P\;\big\}$, the fixpoint $M' = M + M^2$ can be computed as follows:\\\vspace{10pt} \resizebox{\textwidth}{!}{ @@ -1155,310 +1585,6 @@ \section{Algebraic Parsing}\label{sec:algebraic-parsing} \end{equation*} \end{frame} -\begin{frame}[fragile]{Chomsky Denormalization} - Chomksy normalization is needed for matrix-based parsing, however produces lopsided parse trees. We can denormalize them using a simple recusive procedure to restore the natural shape of the original CFG:\vspace{0.5cm}\\ - - \begin{minipage}[l]{6cm} - \vspace{0.3cm}\resizebox{\textwidth}{!}{ - \begin{tabular}{ll} - \Tree [.\texttt{S} \tikz\node(v1){\texttt{true}} [.$\ccancel{\texttt{and.S}}$ \tikz\node(v3){\texttt{and}} [.\texttt{S} \tikz\node(v5){\texttt{(}} [.$\ccancel{\texttt{S.)}}$ [.\texttt{S} \tikz\node(v9){\texttt{false}} [.$\ccancel{\texttt{or.S}}$ \tikz\node(v11){\texttt{or}} [.\texttt{S} \texttt{!} \texttt{true} ] ] ] \tikz\node(v7){\texttt{)}} ] ] ] ] -% \Tree [.S [.NP John ] [.VP [.\tikz\node(v1){V}; sleeps ] ] ] - \hspace{-2cm} - & - \Tree [.\texttt{S} \tikz\node(v2){\texttt{true}} \tikz\node(v4){\texttt{and}} [.\texttt{S} \tikz\node(v6){\texttt{(}} [.\texttt{S} \tikz\node(v10){\texttt{false}} \tikz\node(v12){\texttt{or}} [.\texttt{S} \texttt{!} \texttt{true} ] ] \tikz\node(v8){\texttt{)}} ] ]\\\\ -% \Tree [.\tikz\node(v2){V}; [.\tikz\node(v3){V}; ] [.Adv {a lot} ] ] - \hspace{1cm}\huge{Pre-Denormalization} & \hspace{3cm}\huge{Post-Denormalization} - \end{tabular} - \begin{tikzpicture}[overlay] -% \draw [red,dashed,-stealth] (v1) to[bend left] (v2); - \draw [red,dashed,-stealth] (v3) to[bend left] (v4); -% \draw [red,dashed,-stealth] (v5) to[bend left] (v6); - \draw [red,dashed,-stealth] (v7) to[bend left] (v8); -% \draw [red,dashed,-stealth] (v9) to[bend right] (v10); - \draw [red,dashed,-stealth] (v11) to[bend right] (v12); - \end{tikzpicture} - } -% \caption{Result of applying Algorithm~\ref{alg:cap} to the tree obtained by parsing the string: \texttt{true and ( false or ! 
true )}.} - \end{minipage} - \hspace{-0.7cm}\scalebox{0.6}{ - \begin{minipage}[l]{10cm} - \begin{algorithm}[H] - \caption{Rewrite procedure for tree denormalization}\label{alg:cap} - \begin{algorithmic} - \Procedure{Cut}{\texttt{t: Tree}} - \State $\texttt{stems} \leftarrow \{\:\textsc{Cut}(\texttt{c}) \mid \texttt{c} \in \texttt{t.children}\:\}$ - \If{$\texttt{t.root} \in (V_{\mathcal{G}'} \setminus V_{\mathcal{G}})$} - \State \textbf{return } \texttt{stems} %\Comment{Drop synthetic nonterminals.} - \Else%\Comment{Graft the denormalized children on root.} - \State \textbf{return } $\{\:\texttt{Tree(t.root, stems)}\:\}$ - \EndIf - \EndProcedure - \end{algorithmic} - \end{algorithm} - \end{minipage} -} - -\vspace{1cm}All synthetic nonterminals are excised during Chomsky denormalization. Rewriting improves legibility but does not alter the underlying semantics. -\end{frame} - -\begin{frame}[fragile]{Incremental parsing} - Should only need to recompute submatrices affected by individual edits. In the worst case, each edit requires quadratic complexity in terms of $|\Sigma^*|$, assuming $\mathcal{O}(1)$ cost for each CNF-nonterminal subset join, $\mathbf{V}'_1\otimes \mathbf{V}'_2$. - \begin{center} - \begin{tabular}{ c c c c c } - \scalebox{0.32}{\mkTrellisAppend{7}} & & \scalebox{0.32}{\mkTrellisInsert{6}} & & \scalebox{0.32}{\mkTrellisInsert{7}} \\ - Append & & Delete & & Insert \\ - $\mathcal{O}(n+1)$ & & $\mathcal{O}\left(\frac{1}{4}(n-1)^2\right)$ & & $\mathcal{O}\left(\frac{1}{4}(n+1)^2\right)$ \\ - \end{tabular} - \end{center} - Related to \textbf{dynamic matrix inverse} and \textbf{incremental transitive closure} with vertex updates. With a careful encoding, we can incrementally update SAT constraints as new keystrokes are received to eliminate redundancy. -\end{frame} - -%\begin{frame}{Error Recovery} -% \begin{figure}[H] -% \adjustbox{scale=0.75,center}{% -% \hspace{-0.5cm}\begin{minipage}[l]{6cm} -% \[ -% \begin{NiceMatrix} -% \leftarrow & \nse & \nsi & \nfi & \nfo & \nth & \ntw & \non & \leftarrow & \ppp \\ -% & & \ddd & \ddd & \ddd & \ddd & \ddd & \ddd & \ddd & \ppp \\ -% \sigma_1^\shri & \cdd & & A & & & & & & \ppp \\ -% \vno & \ddd & T_A & \vdd & & & & & & \ppp \\ -% \vdd & \ddd & & \pcd & \cdd & & B & & & \ppp \\ -% & & & & & T_B & \vdd & & & \ppp \\ -% & & & & & & \pcd & \cdd & & C \\ -% & & & & & & & & T_C & \vdd \\ -% & & & & & & & \text{\emoji{cross-mark}} & & \\ -% & & & & & & & & & \\ -% & & & & & & & & & \\ -% & & & & & & & & \ppp & \sigma_n^\shup \\ -% \vno & \cdd & & & & & & & \vno & -% \end{NiceMatrix} -% \] -% \end{minipage} -% } -% \hspace{1cm} -% -% \caption{By recursing over upper diagonals of decreasing elevation and discarding subtrees that fall under the shadow of another's canopy, we can recover parseable subtrees.} -% \end{figure} -%\end{frame} - -\begin{frame}[fragile]{Conjunctive parsing} - It is well-known that the family of CFLs is not closed under intersection. For example, consider $\mathcal{L}_\cap := \mathcal{L}_{\mathcal{G}_1} \cap \mathcal{L}_{\mathcal{G}_2}$: - - \begin{table}[H] - \begin{tabular}{llll} - $P_1 := \big\{\;S \rightarrow L R,$ & $L \rightarrow a b \mid a L b,$ & $R \rightarrow c \mid c R\;\big\}$\vspace{5pt} \\ - $P_2 := \big\{\;S \rightarrow L R,$ & $R \rightarrow b c \mid b R c,$ & $L \rightarrow a \mid a L\;\big\}$ - \end{tabular} - \end{table} - - \noindent Note that $\mathcal{L}_\cap$ generates the language $\big\{\;a^d b^d c^d \mid d > 0\;\big\}$, which according to the pumping lemma is not context-free. 
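+% A minimal Python sketch of the algebraic recognizer from the parsing frames
+% above: the domain is sets of nonterminals, 0 is the empty set, the sum is
+% union, the product joins two sets through the binary CNF productions, and we
+% iterate the fixpoint M = M + M*M. It is instantiated on the example grammar
+% G' = {S -> N L, N -> 0 | 1, O -> × | +, L -> O N}, with holes ('_') matching
+% any terminal; this is the plain fixpoint loop, not the solver-backed version.
+%
+%   UNITS = {'N': {'0', '1'}, 'O': {'×', '+'}}        # unit productions of G'
+%   BINARIES = {('N', 'L'): 'S', ('O', 'N'): 'L'}     # binary productions of G'
+%
+%   def join(X, Z):
+%       return {w for (x, z), w in BINARIES.items() if x in X and z in Z}
+%
+%   def recognize(tokens, start='S'):
+%       n = len(tokens)
+%       M = [[set() for _ in range(n + 1)] for _ in range(n + 1)]
+%       for i, t in enumerate(tokens):            # superdiagonal: terminal/hole sets
+%           M[i][i + 1] = {A for A, ts in UNITS.items() if t == '_' or t in ts}
+%       changed = True
+%       while changed:                            # least fixpoint of M = M + M*M
+%           changed = False
+%           for span in range(2, n + 1):
+%               for i in range(0, n - span + 1):
+%                   k = i + span
+%                   new = set().union(*(join(M[i][j], M[j][k])
+%                                       for j in range(i + 1, k)))
+%                   if not new <= M[i][k]:
+%                       M[i][k] |= new
+%                       changed = True
+%       return start in M[0][n]
+%
+% recognize(['1', '_', '_']) returns True, since S lands in M[0][3], e.g. via
+% the completion "1 + 1".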
To encode $\mathcal{L}_\cap$, we intersect all terminals $\Sigma_\cap := \bigcap_{i=1}^c \Sigma_i$, then for each $t_\cap \in \Sigma_\cap$ and CFG, construct an equivalence class $E(t_\cap, \mathcal{G}_i) = \{ w_i \mid (w_i \rightarrow t_\cap) \in P_i\}$ as follows:\vspace{-5pt} - - \begin{align} - \bigwedge_{t\in\Sigma_\cap}\bigwedge_{j = 1}^{c-1}\bigwedge_{i=1}^{|\sigma|} E(t_{\cap}, \mathcal{G}_j) \equiv_{\sigma_i} E(t_{\cap}, \mathcal{G}_{j+1}) - \end{align} - % Generated by cfl4_intersection.vox, open with https://voxelator.com/ - \begin{figure}[H] - \includegraphics[height=0.093\textwidth]{../figures/angle1.png}\hspace{-5pt} - \includegraphics[height=0.093\textwidth]{../figures/angle2.png}\hspace{-5pt} - \includegraphics[height=0.093\textwidth]{../figures/angle5.png}\hspace{-5pt} - \includegraphics[height=0.093\textwidth]{../figures/angle3.png}\hspace{-3pt} - \includegraphics[height=0.093\textwidth]{../figures/angle4.png} - \end{figure} -\end{frame} - -\begin{frame}[fragile]{Levenshtein reachability and monotone infinite automata} - \begin{figure}[H] - \resizebox{\textwidth}{!}{ - \begin{tikzpicture}[ -%->, % makes the edges directed - >=stealth', - node distance=2.5cm, % specifies the minimum distance between two nodes. Change if necessary. -% every state/.style={thick, fill=gray!10}, % sets the properties for each ’state’ node - initial text=$ $, % sets the text that appears on the start arrow - ] - \node[state, initial] (00) {$q_{0,0}$}; - \node[state, right of=00] (10) {$q_{1,0}$}; - \node[accepting, state, right of=10] (20) {$q_{2,0}$}; - \node[accepting, state, right of=20] (30) {$q_{3,0}$}; - \node[accepting, state, right of=30] (40) {$q_{4,0}$}; - \node[accepting, state, right of=40] (50) {$q_{5,0}$}; - - \node[state, above of=00, shift={(-2cm,0cm)}] (01) {$q_{0,1}$}; - \node[state, right of=01] (11) {$q_{1,1}$}; - \node[state, right of=11] (21) {$q_{2,1}$}; - \node[accepting, state, right of=21] (31) {$q_{3,1}$}; - \node[accepting, state, right of=31] (41) {$q_{4,1}$}; - \node[accepting, state, right of=41] (51) {$q_{5,1}$}; - - \node[state, above of=01, shift={(-2cm,0cm)}] (0j) {$q_{0,2}$}; - \node[state, right of=0j] (1j) {$q_{1,2}$}; - \node[state, right of=1j] (2j) {$q_{2,2}$}; - \node[state, right of=2j] (3j) {$q_{3,2}$}; - \node[accepting, state, right of=3j] (4j) {$q_{4,2}$}; - \node[accepting, state, right of=4j] (5j) {$q_{5,2}$}; - - \node[state, above of=0j, shift={(-2cm,0cm)}] (0k) {$q_{0,3}$}; - \node[state, right of=0k] (1k) {$q_{1,3}$}; - \node[state, right of=1k] (2k) {$q_{2,3}$}; - \node[state, right of=2k] (3k) {$q_{3,3}$}; - \node[state, right of=3k] (4k) {$q_{4,3}$}; - \node[accepting, state, right of=4k] (5k) {$q_{5,3}$}; - - \draw [->] (00) edge[below] node{$\sigma_1$} (10); - \draw [->] (10) edge[below] node{$\sigma_2$} (20); - \draw [->] (20) edge[below] node{$\sigma_3$} (30); - \draw [->] (30) edge[below] node{$\sigma_4$} (40); - \draw [->] (40) edge[below] node{$\sigma_5$} (50); - - \draw [->] (01) edge[below] node{$\sigma_1$} (11); - \draw [->] (11) edge[below] node[shift={(-0.2cm,0cm)}]{$\sigma_2$} (21); - \draw [->] (21) edge[below] node[shift={(-0.2cm,0cm)}]{$\sigma_3$} (31); - \draw [->] (31) edge[below] node[shift={(-0.2cm,0cm)}]{$\sigma_4$} (41); - \draw [->] (41) edge[below] node{$\sigma_5$} (51); - - \draw [->] (0j) edge[below] node{$\sigma_1$} (1j); - \draw [->] (1j) edge[below] node{$\sigma_2$} (2j); - \draw [->] (2j) edge[below] node{$\sigma_3$} (3j); - \draw [->] (3j) edge[below] node{$\sigma_4$} (4j); - \draw [->] (4j) edge[below] 
node{$\sigma_5$} (5j); - - \draw [->] (0k) edge[below] node{$\sigma_1$} (1k); - \draw [->] (1k) edge[below] node{$\sigma_2$} (2k); - \draw [->] (2k) edge[below] node{$\sigma_3$} (3k); - \draw [->] (3k) edge[below] node{$\sigma_4$} (4k); - \draw [->] (4k) edge[below] node{$\sigma_5$} (5k); - - \draw [->] (00) edge[left] node{$\phantom{\cdot}$} (11); - \draw [->] (10) edge[left] node{$\phantom{\cdot}$} (21); - \draw [->] (20) edge[left] node{$\phantom{\cdot}$} (31); - \draw [->] (30) edge[left] node{$\phantom{\cdot}$} (41); - \draw [->] (40) edge[left] node{$\phantom{\cdot}$} (51); - -% Super-knight arcs - \draw [->, red] (00) edge[bend right=8] node[east, shift={(-0.2cm,-0.7cm)}]{$\color{red}\sigma_3$} (3j); - \draw [->, red] (10) edge[bend right=8] node[east, shift={(-0.2cm,-0.7cm)}]{$\color{red}\sigma_4$} (4j); - \draw [->, red] (20) edge[bend right=8] node[east, shift={(-0.2cm,-0.7cm)}]{$\color{red}\sigma_5$} (5j); - - \draw [->, red] (01) edge[bend left=8] node[east, shift={(-0.2cm,-0.7cm)}]{$\color{red}\sigma_3$} (3k); - \draw [->, red] (11) edge[bend left=8] node[east, shift={(-0.2cm,-0.7cm)}]{$\color{red}\sigma_4$} (4k); - \draw [->, red] (21) edge[bend left=8] node[east, shift={(-0.2cm,-0.7cm)}]{$\color{red}\sigma_5$} (5k); - - \draw [->, violet] (00) edge node[east, shift={(-0.1cm,-0.8cm)}]{$\color{violet}\sigma_4$} (4k); - \draw [->, violet] (10) edge node[east, shift={(-0.1cm,-0.8cm)}]{$\color{violet}\sigma_5$} (5k); - - \draw [->] (01) edge[left] node{$\phantom{\cdot}$} (1j); - \draw [->] (11) edge[left] node{$\phantom{\cdot}$} (2j); - \draw [->] (21) edge[left] node{$\phantom{\cdot}$} (3j); - \draw [->] (31) edge[left] node{$\phantom{\cdot}$} (4j); - \draw [->] (41) edge[left] node{$\phantom{\cdot}$} (5j); - - \draw [->] (0j) edge[left] node{$\phantom{\cdot}$} (1k); - \draw [->] (1j) edge[left] node{$\phantom{\cdot}$} (2k); - \draw [->] (2j) edge[left] node{$\phantom{\cdot}$} (3k); - \draw [->] (3j) edge[left] node{$\phantom{\cdot}$} (4k); - \draw [->] (4j) edge[left] node{$\phantom{\cdot}$} (5k); - - \draw [->] (00) edge[bend left=10, left] node{$\phantom{\cdot}$} (01); - \draw [->] (10) edge[bend left=10, left] node{$\phantom{\cdot}$} (11); - \draw [->] (20) edge[bend left=10, left] node{$\phantom{\cdot}$} (21); - \draw [->] (30) edge[bend left=10, left] node{$\phantom{\cdot}$} (31); - \draw [->] (40) edge[bend left=10, left] node{$\phantom{\cdot}$} (41); - \draw [->] (50) edge[bend left=10, left] node{$\phantom{\cdot}$} (51); - - \draw [->] (01) edge[bend left=10, left] node{$\phantom{\cdot}$} (0j); - \draw [->] (11) edge[bend left=10, left] node{$\phantom{\cdot}$} (1j); - \draw [->] (21) edge[bend left=10, left] node{$\phantom{\cdot}$} (2j); - \draw [->] (31) edge[bend left=10, left] node{$\phantom{\cdot}$} (3j); - \draw [->] (41) edge[bend left=10, left] node{$\phantom{\cdot}$} (4j); - \draw [->] (51) edge[bend left=10, left] node{$\phantom{\cdot}$} (5j); - - \draw [->] (0j) edge[bend left=10, left] node{$\phantom{\cdot}$} (0k); - \draw [->] (1j) edge[bend left=10, left] node{$\phantom{\cdot}$} (1k); - \draw [->] (2j) edge[bend left=10, left] node{$\phantom{\cdot}$} (2k); - \draw [->] (3j) edge[bend left=10, left] node{$\phantom{\cdot}$} (3k); - \draw [->] (4j) edge[bend left=10, left] node{$\phantom{\cdot}$} (4k); - \draw [->] (5j) edge[bend left=10, left] node{$\phantom{\cdot}$} (5k); - - \draw [->, blue] (00) edge[bend right=11,below] node[shift={(0.5cm,0.3cm)}]{$\color{blue}\sigma_2$} (21); - \draw [->, blue] (10) edge[bend right=11,below] 
node[shift={(0.5cm,0.3cm)}]{$\color{blue}\sigma_3$} (31); - \draw [->, blue] (20) edge[bend right=11,below] node[shift={(0.5cm,0.3cm)}]{$\color{blue}\sigma_4$} (41); - \draw [->, blue] (30) edge[bend right=11,below] node[shift={(0.5cm,0.3cm)}]{$\color{blue}\sigma_5$} (51); - - \draw [->, blue] (01) edge[bend right=3,below] node[shift={(0.3cm,0.2cm)}]{$\color{blue}\sigma_2$} (2j); - \draw [->, blue] (11) edge[bend right=3,below] node[shift={(0.3cm,0.2cm)}]{$\color{blue}\sigma_3$} (3j); - \draw [->, blue] (21) edge[bend right=3,below] node[shift={(0.3cm,0.2cm)}]{$\color{blue}\sigma_4$} (4j); - \draw [->, blue] (31) edge[bend right=3,below] node[shift={(0.3cm,0.2cm)}]{$\color{blue}\sigma_4$} (5j); - - \draw [->, blue] (0j) edge[bend left=8,below] node[shift={(-0.45cm,-0.55cm)}]{$\color{blue}\sigma_2$} (2k); - \draw [->, blue] (1j) edge[bend left=8,below] node[shift={(-0.45cm,-0.55cm)}]{$\color{blue}\sigma_3$} (3k); - \draw [->, blue] (2j) edge[bend left=8,below] node[shift={(-0.45cm,-0.55cm)}]{$\color{blue}\sigma_4$} (4k); - \draw [->, blue] (3j) edge[bend left=8,below] node[shift={(-0.45cm,-0.55cm)}]{$\color{blue}\sigma_5$} (5k); - -%https://tex.stackexchange.com/a/20986/139648 - \draw [decorate,decoration={brace,amplitude=10pt,raise=10pt,mirror}] (00.south west) -- (50.south east) node[midway,yshift=-3em]{\textbf{String length}}; - \draw [decorate,decoration={brace,amplitude=10pt,raise=20pt}] (00.south west) -- (0k.north west) node[midway,xshift=-1cm,yshift=-1cm,rotate=-54]{\textbf{Edit distance}}; - \end{tikzpicture} - } -\caption{Bounded Levenshtein reachability from $\sigma: \Sigma^n$ is expressible as an NFA populated by accept states within radius $k$ of $S=q_{n,0}$, which accepts all strings $\sigma'$ within Levenshtein radius $k$ of $\sigma$.} -\end{figure} -\end{frame} - -\begin{frame}[fragile]{The Chomsky-Levenshtein-Bar-Hillel Construction} -The original Bar-Hillel construction provides a way to construct a grammar for the intersection of a regular and context-free langauge. -\noindent\begin{prooftree} - \AxiomC{$q \in I \phantom{\land} r \in F\vphantom{\overset{a}{\rightarrow}}$} -% \RightLabel{$\varepsilon^+\textsc{-int}$} - \UnaryInfC{$\big(S\rightarrow q S r\big) \in P_\cap$} - \DisplayProof - \hskip 0.5em - \AxiomC{$(q\overset{a}{\rightarrow}r) \in \delta$} -% \RightLabel{$\varepsilon^+\textsc{-int}$} - \UnaryInfC{$\big(qar\rightarrow a\big)\in P_\cap$} - \DisplayProof - \hskip 0.5em -%\end{prooftree} -%\begin{prooftree} - \AxiomC{$(w \rightarrow xz) \in P\vphantom{\overset{a}{\rightarrow}}$} - \AxiomC{$p,q,r \in Q$} - \BinaryInfC{$\big(pwr\rightarrow (pxq)(qzr)\big) \in P_\cap$} -\end{prooftree} - -The Levenshtein automata is another kind of lattice, not in the order-theoretic sense, but in the automata-theoretic sense. 
- -\begin{prooftree} - \AxiomC{$s\in\Sigma \phantom{\land} i \in [0, n] \phantom{\land} j \in [1, k]$} - \RightLabel{$\duparrow$} - \UnaryInfC{$(q_{i, j-1} \overset{s}{\rightarrow} q_{i,j}) \in \delta$} - \DisplayProof - \hskip 1.5em - \AxiomC{$s\in\Sigma \phantom{\land} i \in [1, n] \phantom{\land} j \in [1, k]$} - \RightLabel{$\ddiagarrow$} - \UnaryInfC{$(q_{i-1, j-1} \overset{s}{\rightarrow} q_{i,j}) \in \delta$} -\end{prooftree} -\begin{prooftree} - \AxiomC{$s=\sigma_i \phantom{\land} i \in [1, n] \phantom{\land} j \in [0, k]$} - \RightLabel{$\drightarrow$} - \UnaryInfC{$(q_{i-1, j} \overset{s}{\rightarrow} q_{i,j}) \in \delta$} - \DisplayProof - \hskip 1.5em - \AxiomC{$s=\sigma_i \phantom{\land} i \in [2, n] \phantom{\land} j \in [1, k]$} - \RightLabel{$\knightarrow$} - \UnaryInfC{$(q_{i-2, j-1} \overset{s}{\rightarrow} q_{i,j}) \in \delta$} -\end{prooftree} -\begin{prooftree} - \AxiomC{$\vphantom{|}$} - \RightLabel{$\textsc{Init}$} - \UnaryInfC{$q_{0,0} \in I$} - \DisplayProof - \hskip 1.5em - \AxiomC{$q_{i, j}$} - \AxiomC{$|n-i+j| \leq k$} - \RightLabel{$\textsc{Done}$} - \BinaryInfC{$q_{i, j}\in F$} -\end{prooftree} -\end{frame} - %\begin{frame}[fragile]{A birds eye view of the algorithm} % \begin{figure}[H] % \adjustbox{scale=0.75,center}{% @@ -1502,90 +1628,66 @@ \section{Error Correction}\label{sec:error-correction} %We want a permutation mapping $f: A \rightarrow \mathbb{N} \mid \forall a \in A \exists i \in \mathbb{N}.f^{-1}(i) = a$. Then we can just sample $i \sim \mathbb{N}$ without replacement and apply $f^{-1}(i)$. %\end{frame} -\begin{frame}[fragile]{Error Correction: Levenshtein q-Balls} - Now that we have a reliable method to fix \textit{localized} errors, $S: \mathcal{G} \times (\Sigma\cup\{\varepsilon, \texttt{\texttt{\_}}\})^n \rightarrow \{\Sigma^n\}\subseteq \mathcal{L}_\mathcal{G}$, given some unparseable string, i.e., $\err{\sigma_1\ldots\:\sigma_n}: \highlight{\Sigma}^n \cap\mathcal{L}_\mathcal{G}^\complement$, where should we put holes to obtain a parseable $\sigma' \in \mathcal{L}_\mathcal{G}$? One way to do so is by sampling repairs, $\bm{\sigma}\sim\Sigma^{n\pm q}\cap\Delta_{q}(\err{\sigma})$ from the Levenshtein q-ball centered on $\err{\sigma}$, i.e., the space of all admissible edits with Levenshtein distance $\leq q$ (this is loosely analogous to a finite difference approximation). 
To admit variable-length edits, we first add an $\varepsilon^+$-production to each unit production:\vspace{5pt} - - \begin{prooftree} - \AxiomC{$\mathcal{G} \vdash \varepsilon \in \Sigma$} - \RightLabel{$\varepsilon\textsc{-dup}$} - \UnaryInfC{$\mathcal{G} \vdash (\varepsilon^+ \rightarrow \varepsilon \mid \varepsilon^+\:\varepsilon^+) \in P$} - \end{prooftree} - - \begin{prooftree} - \AxiomC{$\mathcal{G} \vdash (A \rightarrow B) \in P$} - \RightLabel{$\varepsilon^+\textsc{-int}$} - \UnaryInfC{$\mathcal{G} \vdash (A \rightarrow B\:\varepsilon^+ \mid \varepsilon^+\:B \mid B) \in P$} - \end{prooftree} -\end{frame} - -\begin{frame}[fragile]{Error Correction: d-Subset Sampling} - \noindent Next, suppose $U: \mathbb{Z}_2^{m\times m}$ is a matrix whose structure is shown in Eq.~\ref{eq:lfsr}, wherein $C$ is a primitive polynomial over $\mathbb{Z}_2^m$ with coefficients $C_{1\ldots m}$ and semiring operators $\oplus := \veebar, \otimes := \land$:\vspace{-5pt} - - \begin{align} - U^tV = \begin{pNiceMatrix}[nullify-dots,xdots/line-style=loosely dotted] - C_1 & \cdd & & & C_m \\ - \top & \circ & \cdd & & \circ \\ - \circ & \ddd & \ddd & & \vdd \\ - \vdd & \ddd & & & \\ - \circ & \cdd & \circ & \top & \circ - \end{pNiceMatrix}^t - \begin{pNiceMatrix}[nullify-dots,xdots/line-style=loosely dotted] - V_1 \\ - \vdd\\ - \\ - \\ - V_m - \end{pNiceMatrix}\label{eq:lfsr} - \end{align} - - \noindent Since $C$ is primitive, the sequence $\mathbf{S} = (U^{0 \ldots 2^m-1}V)$ must have \textit{full periodicity}, i.e., for all $i, j \in[0, 2^m)$, ${\mathbf{S}_i = \mathbf{S}_j \Rightarrow i = j}$. To uniformly sample $\bm\sigma$ without replacement, we first form an injection $\mathbb{Z}_2^m\rightharpoonup\stirlingii{n}{d}\times\Sigma_\varepsilon^{2d}$ using a combinatorial number system, cycle over $\mathbf{S}$, then discard samples which have no witness in $\stirlingii{n}{d}\times\Sigma_\varepsilon^{2d}$. This method requires $\widetilde{\mathcal O}(1)$ per sample and $\widetilde{\mathcal O}\left({n \choose d}|\Sigma + 1|^{2d}\right)$ to exhaustively search $\stirlingii{n}{d}\times\Sigma_\varepsilon^{2d}$. -\end{frame} - -\begin{frame}[fragile]{Error Correction: Sketch Templates} - Finally, to sample $\bm{\sigma}\sim\Delta_{q}(\err{\sigma})$, we enumerate a series of sketch templates $H(\sigma, i) = \sigma_{1\ldots i-1}\:\text{\texttt{\_} \texttt{\_}}\:\sigma_{i+1\ldots n}$ for each $i \in \cdot \in \stirlingii{n}{d}$ and $d \in 1\ldots q$, then solve for $\mathcal{M}_{\bm\sigma}^*$. 
If $S \in \Lambda^*_{\bm\sigma}?$ has a solution, each edit in each $\sigma' \in \bm\sigma$ will match exactly one of the following seven edit patterns:\vspace{-10pt} - - \begin{align*} - \text{Deletion}&=\begin{cases} - \,\ldots\sigma_{i-1}\:\text{\hlred{$\gamma_1$}\:\hlred{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\gamma_{1, 2} = \varepsilon\label{eq:del} - \end{cases}\\ - \text{Substitution}&=\begin{cases} - \ldots\sigma_{i-1}\:\text{\hlorange{$\gamma_1$}\:\hlred{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\gamma_1 \neq \varepsilon \land \gamma_2 = \varepsilon\\ - \ldots\sigma_{i-1}\:\text{\hlred{$\gamma_1$}\:\hlorange{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\gamma_1 = \varepsilon \land \gamma_2 \neq \varepsilon\\ - \ldots\sigma_{i-1}\:\text{\hlorange{$\gamma_1$}\:\hlorange{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\{\gamma_1, \gamma_2\}\cap\{\varepsilon, \sigma_i\} = \varnothing - \end{cases}\\ - \text{Insertion}&=\begin{cases} - \ldots\sigma_{i-1}\:\text{\hlgreen{$\gamma_1$}\:\hlorange{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\gamma_1 = \sigma_i \land \gamma_2 \notin \{\varepsilon, \sigma_i\}\label{eq:ins2}\\ - \ldots\sigma_{i-1}\:\text{\hlorange{$\gamma_1$}\:\hlgreen{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\gamma_1 \notin \{\varepsilon, \sigma_i\} \land \gamma_2 = \sigma_i\label{eq:ins1}\\ - \ldots\sigma_{i-1}\:\text{\hlgreen{$\gamma_1$}\:\hlgreen{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\gamma_{1,2} = \sigma_i\label{eq:copy} - \end{cases} - \end{align*} -\end{frame} - - -\begin{frame}[fragile]{An Simple Reachability Proof} -\begin{lemma} -For any nonempty language $\ell: \mathcal{L}(\mathcal{G})$ and invalid string $\err{\sigma}: \Sigma^n$, there exists an $(\tilde{\sigma}, m)$ such that $\tilde{\sigma} \in \ell\cap\Sigma^m$ and $0 < \Delta(\err{\sigma}, \ell) \leq \max(m, n) < \infty$, where $\Delta$ denotes the Levenshtein edit distance.\\ -\end{lemma} - -\begin{proof} -Since $\ell$ is nonempty, it must have at least one inhabitant $\sigma \in \ell$. Let $\tilde{\sigma}$ be the smallest such member. Since $\tilde{\sigma}$ is a valid sentence in $\ell$, by definition it must be that $|\tilde{\sigma}|<\infty$. Let $m:=|\tilde{\sigma}|$. Since we know $\err{\sigma} \notin \ell$, it follows that $0 < \Delta(\err{\sigma}, \ell)$. Let us consider two cases, either $\tilde{\sigma} = \varepsilon$, or $0 < |\tilde{\sigma}|$: - -\begin{itemize} -\item If $\tilde{\sigma} = \varepsilon$, then $\Delta(\err{\sigma}, \tilde{\sigma}) = n$ by full erasure of $\err{\sigma}$, or -\item If $0 < m$, then $\Delta(\err{\sigma}, \tilde{\sigma}) \leq \max(m, n)$ by overwriting. -\end{itemize} +%\begin{frame}[fragile]{Error Correction: Levenshtein q-Balls} +% Now that we have a reliable method to fix \textit{localized} errors, $S: \mathcal{G} \times (\Sigma\cup\{\varepsilon, \texttt{\texttt{\_}}\})^n \rightarrow \{\Sigma^n\}\subseteq \mathcal{L}_\mathcal{G}$, given some unparseable string, i.e., $\err{\sigma_1\ldots\:\sigma_n}: \highlight{\Sigma}^n \cap\mathcal{L}_\mathcal{G}^\complement$, where should we put holes to obtain a parseable $\sigma' \in \mathcal{L}_\mathcal{G}$? One way to do so is by sampling repairs, $\bm{\sigma}\sim\Sigma^{n\pm q}\cap\Delta_{q}(\err{\sigma})$ from the Levenshtein q-ball centered on $\err{\sigma}$, i.e., the space of all admissible edits with Levenshtein distance $\leq q$ (this is loosely analogous to a finite difference approximation). 
To admit variable-length edits, we first add an $\varepsilon^+$-production to each unit production:\vspace{5pt} +% +% \begin{prooftree} +% \AxiomC{$\mathcal{G} \vdash \varepsilon \in \Sigma$} +% \RightLabel{$\varepsilon\textsc{-dup}$} +% \UnaryInfC{$\mathcal{G} \vdash (\varepsilon^+ \rightarrow \varepsilon \mid \varepsilon^+\:\varepsilon^+) \in P$} +% \end{prooftree} +% +% \begin{prooftree} +% \AxiomC{$\mathcal{G} \vdash (A \rightarrow B) \in P$} +% \RightLabel{$\varepsilon^+\textsc{-int}$} +% \UnaryInfC{$\mathcal{G} \vdash (A \rightarrow B\:\varepsilon^+ \mid \varepsilon^+\:B \mid B) \in P$} +% \end{prooftree} +%\end{frame} +% +%\begin{frame}[fragile]{Error Correction: d-Subset Sampling} +% \noindent Next, suppose $U: \mathbb{Z}_2^{m\times m}$ is a matrix whose structure is shown in Eq.~\ref{eq:lfsr}, wherein $C$ is a primitive polynomial over $\mathbb{Z}_2^m$ with coefficients $C_{1\ldots m}$ and semiring operators $\oplus := \veebar, \otimes := \land$:\vspace{-5pt} +% +% \begin{align} +% U^tV = \begin{pNiceMatrix}[nullify-dots,xdots/line-style=loosely dotted] +% C_1 & \cdd & & & C_m \\ +% \top & \circ & \cdd & & \circ \\ +% \circ & \ddd & \ddd & & \vdd \\ +% \vdd & \ddd & & & \\ +% \circ & \cdd & \circ & \top & \circ +% \end{pNiceMatrix}^t +% \begin{pNiceMatrix}[nullify-dots,xdots/line-style=loosely dotted] +% V_1 \\ +% \vdd\\ +% \\ +% \\ +% V_m +% \end{pNiceMatrix}\label{eq:lfsr} +% \end{align} +% +% \noindent Since $C$ is primitive, the sequence $\mathbf{S} = (U^{0 \ldots 2^m-1}V)$ must have \textit{full periodicity}, i.e., for all $i, j \in[0, 2^m)$, ${\mathbf{S}_i = \mathbf{S}_j \Rightarrow i = j}$. To uniformly sample $\bm\sigma$ without replacement, we first form an injection $\mathbb{Z}_2^m\rightharpoonup\stirlingii{n}{d}\times\Sigma_\varepsilon^{2d}$ using a combinatorial number system, cycle over $\mathbf{S}$, then discard samples which have no witness in $\stirlingii{n}{d}\times\Sigma_\varepsilon^{2d}$. This method requires $\widetilde{\mathcal O}(1)$ per sample and $\widetilde{\mathcal O}\left({n \choose d}|\Sigma + 1|^{2d}\right)$ to exhaustively search $\stirlingii{n}{d}\times\Sigma_\varepsilon^{2d}$. +%\end{frame} +% +%\begin{frame}[fragile]{Error Correction: Sketch Templates} +% Finally, to sample $\bm{\sigma}\sim\Delta_{q}(\err{\sigma})$, we enumerate a series of sketch templates $H(\sigma, i) = \sigma_{1\ldots i-1}\:\text{\texttt{\_} \texttt{\_}}\:\sigma_{i+1\ldots n}$ for each $i \in \cdot \in \stirlingii{n}{d}$ and $d \in 1\ldots q$, then solve for $\mathcal{M}_{\bm\sigma}^*$. 
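+% A minimal Python sketch of the full-period sampler from the d-subset sampling
+% frame above, shrunk to a 16-bit register for illustration (in practice m is
+% chosen large enough to index the whole space of $\binom{n}{d}|\Sigma_\varepsilon|^{2d}$ sketches).
+% The Fibonacci LFSR below uses the primitive polynomial x^16+x^14+x^13+x^11+1,
+% so it visits every nonzero 16-bit state exactly once per period; states that
+% decode to an index outside the target space are rejected, which enumerates
+% the space in a pseudorandom order without repetition.
+%
+%   def sample_indices(space_size, seed=0xACE1):
+%       # Yields each index in [0, space_size) exactly once, in LFSR order.
+%       assert 0 < space_size <= 2 ** 16 - 1 and seed != 0
+%       lfsr = seed
+%       for _ in range(2 ** 16 - 1):             # one full period
+%           bit = ((lfsr >> 0) ^ (lfsr >> 2) ^ (lfsr >> 3) ^ (lfsr >> 5)) & 1
+%           lfsr = (lfsr >> 1) | (bit << 15)     # shift in the feedback bit
+%           if lfsr <= space_size:               # rejection: no witness, skip
+%               yield lfsr - 1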
If $S \in \Lambda^*_{\bm\sigma}?$ has a solution, each edit in each $\sigma' \in \bm\sigma$ will match exactly one of the following seven edit patterns:\vspace{-10pt} +% +% \begin{align*} +% \text{Deletion}&=\begin{cases} +% \,\ldots\sigma_{i-1}\:\text{\hlred{$\gamma_1$}\:\hlred{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\gamma_{1, 2} = \varepsilon\label{eq:del} +% \end{cases}\\ +% \text{Substitution}&=\begin{cases} +% \ldots\sigma_{i-1}\:\text{\hlorange{$\gamma_1$}\:\hlred{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\gamma_1 \neq \varepsilon \land \gamma_2 = \varepsilon\\ +% \ldots\sigma_{i-1}\:\text{\hlred{$\gamma_1$}\:\hlorange{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\gamma_1 = \varepsilon \land \gamma_2 \neq \varepsilon\\ +% \ldots\sigma_{i-1}\:\text{\hlorange{$\gamma_1$}\:\hlorange{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\{\gamma_1, \gamma_2\}\cap\{\varepsilon, \sigma_i\} = \varnothing +% \end{cases}\\ +% \text{Insertion}&=\begin{cases} +% \ldots\sigma_{i-1}\:\text{\hlgreen{$\gamma_1$}\:\hlorange{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\gamma_1 = \sigma_i \land \gamma_2 \notin \{\varepsilon, \sigma_i\}\label{eq:ins2}\\ +% \ldots\sigma_{i-1}\:\text{\hlorange{$\gamma_1$}\:\hlgreen{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\gamma_1 \notin \{\varepsilon, \sigma_i\} \land \gamma_2 = \sigma_i\label{eq:ins1}\\ +% \ldots\sigma_{i-1}\:\text{\hlgreen{$\gamma_1$}\:\hlgreen{$\gamma_2$}}\:\sigma_{i+1}\ldots\hspace{0.2cm}\gamma_{1,2} = \sigma_i\label{eq:copy} +% \end{cases} +% \end{align*} +%\end{frame} -In either case, it follows $\Delta(\err{\sigma}, \ell) \leq \max(m, n)$ and $\ell$ is always reachable via a finite nonempty set of Levenshtein edits, i.e., $0 < \Delta(\err{\sigma}, \ell) < \infty$. -\end{proof} -\end{frame} -\begin{frame}[fragile]{Probabilistic repair generation} -\resizebox{\textwidth}{!}{ -\begin{minipage}{1.4\textwidth} -\input{prob_reach.tex} -\end{minipage} -} -\end{frame} %\begin{frame}[fragile]{Level I: Known Error Locations} % \begin{figure}[H] @@ -1732,7 +1834,7 @@ \section{Error Correction}\label{sec:error-correction} \item \href{http://www-igm.univ-mlv.fr/~berstel/Mps/Travaux/A/1963-7ChomskyAlgebraic.pdf}{Chomsky \& Sch\"utzenberger (1959) - The algebraic theory of CFLs} \item Cocke–Younger–Kasami (1961) - Bottom-up matrix-based parsing \item \href{https://dl.acm.org/doi/10.1145/321239.321249}{Brzozowski (1964) - Derivatives of regular expressions} - \item \href{https://dl.acm.org/doi/10.1145/362007.362035}{Earley (1968) - top-down dynamic programming (no CNF needed)} +% \item \href{https://dl.acm.org/doi/10.1145/362007.362035}{Earley (1968) - top-down dynamic programming (no CNF needed)} \item \href{http://theory.stanford.edu/~virgi/cs367/papers/valiantcfg.pdf}{Valiant (1975) - first realizes the Boolean matrix correspondence} \begin{itemize} \item Na\"ively, has complexity $\mathcal{O}(n^4)$, can be reduced to $\mathcal{O}(n^\omega)$, $\omega < 2.763$ @@ -1743,6 +1845,7 @@ \section{Error Correction}\label{sec:error-correction} \item \href{https://arxiv.org/pdf/1601.07724.pdf}{Bernady \& Jansson (2015) - Certifies Valiant (1975) in Agda} \item \href{https://arxiv.org/pdf/1504.08342.pdf}{Cohen \& Gildea (2016) - Generalizes Valiant (1975) to parse and recognize mildly context sensitive languages, e.g. 
LCFRS, TAG, CCG} \item \textbf{Considine, Guo \& Si (2022) - SAT + Valiant (1975) + holes} + \item \textbf{Considine, Guo \& Si (2024) - Levenshtein Bar-Hillel repairs} \end{itemize} \end{frame} @@ -2310,10 +2413,11 @@ \section{Error Correction}\label{sec:error-correction} \begin{frame}{Special thanks} \begin{center} \LARGE{ - Jin Guo, Xujie Si\\ - Brigitte Pientka, David Yu-Tung Hui,\\ + Jin Guo, Xujie Si, David Bieber,\\ + David Chiang, Brigitte Pientka, David Hui,\\ Ori Roth, Younesse Kaddar, Michael Schröder\\ - Torsten Scholak, Matthew Sotoudeh, Paul Zhu\\ + Will Chrichton, Kristopher Micinski, Alex Lew\\ + Matthijs Vákár, Michael Coblenz, Maddy Bowers\\ \phantom{}\\ } \href{https://cs.mcgill.ca}{\includegraphics[scale=0.06]{../figures/mcgill_logo.png}} @@ -2327,4 +2431,180 @@ \section{Error Correction}\label{sec:error-correction} \href{https://tidyparse.github.io}{\color{blue}{https://tidyparse.github.io}}} \end{center} \end{frame} + +%\begin{frame}[fragile]{Chomsky Denormalization} +% Chomksy normalization is needed for matrix-based parsing, however produces lopsided parse trees. We can denormalize them using a simple recusive procedure to restore the natural shape of the original CFG:\vspace{0.5cm}\\ +% +% \begin{minipage}[l]{6cm} +% \vspace{0.3cm}\resizebox{\textwidth}{!}{ +% \begin{tabular}{ll} +% \Tree [.\texttt{S} \tikz\node(v1){\texttt{true}} [.$\ccancel{\texttt{and.S}}$ \tikz\node(v3){\texttt{and}} [.\texttt{S} \tikz\node(v5){\texttt{(}} [.$\ccancel{\texttt{S.)}}$ [.\texttt{S} \tikz\node(v9){\texttt{false}} [.$\ccancel{\texttt{or.S}}$ \tikz\node(v11){\texttt{or}} [.\texttt{S} \texttt{!} \texttt{true} ] ] ] \tikz\node(v7){\texttt{)}} ] ] ] ] +%% \Tree [.S [.NP John ] [.VP [.\tikz\node(v1){V}; sleeps ] ] ] +% \hspace{-2cm} +% & +% \Tree [.\texttt{S} \tikz\node(v2){\texttt{true}} \tikz\node(v4){\texttt{and}} [.\texttt{S} \tikz\node(v6){\texttt{(}} [.\texttt{S} \tikz\node(v10){\texttt{false}} \tikz\node(v12){\texttt{or}} [.\texttt{S} \texttt{!} \texttt{true} ] ] \tikz\node(v8){\texttt{)}} ] ]\\\\ +%% \Tree [.\tikz\node(v2){V}; [.\tikz\node(v3){V}; ] [.Adv {a lot} ] ] +% \hspace{1cm}\huge{Pre-Denormalization} & \hspace{3cm}\huge{Post-Denormalization} +% \end{tabular} +% \begin{tikzpicture}[overlay] +%% \draw [red,dashed,-stealth] (v1) to[bend left] (v2); +% \draw [red,dashed,-stealth] (v3) to[bend left] (v4); +%% \draw [red,dashed,-stealth] (v5) to[bend left] (v6); +% \draw [red,dashed,-stealth] (v7) to[bend left] (v8); +%% \draw [red,dashed,-stealth] (v9) to[bend right] (v10); +% \draw [red,dashed,-stealth] (v11) to[bend right] (v12); +% \end{tikzpicture} +% } +%% \caption{Result of applying Algorithm~\ref{alg:cap} to the tree obtained by parsing the string: \texttt{true and ( false or ! 
true )}.} +% \end{minipage} +% \hspace{-0.7cm}\scalebox{0.6}{ +% \begin{minipage}[l]{10cm} +% \begin{algorithm}[H] +% \caption{Rewrite procedure for tree denormalization}\label{alg:cap} +% \begin{algorithmic} +% \Procedure{Cut}{\texttt{t: Tree}} +% \State $\texttt{stems} \leftarrow \{\:\textsc{Cut}(\texttt{c}) \mid \texttt{c} \in \texttt{t.children}\:\}$ +% \If{$\texttt{t.root} \in (V_{\mathcal{G}'} \setminus V_{\mathcal{G}})$} +% \State \textbf{return } \texttt{stems} %\Comment{Drop synthetic nonterminals.} +% \Else%\Comment{Graft the denormalized children on root.} +% \State \textbf{return } $\{\:\texttt{Tree(t.root, stems)}\:\}$ +% \EndIf +% \EndProcedure +% \end{algorithmic} +% \end{algorithm} +% \end{minipage} +% } +% +% \vspace{1cm}All synthetic nonterminals are excised during Chomsky denormalization. Rewriting improves legibility but does not alter the underlying semantics. +%\end{frame} +% +%\begin{frame}[fragile]{Incremental parsing} +% Should only need to recompute submatrices affected by individual edits. In the worst case, each edit requires quadratic complexity in terms of $|\Sigma^*|$, assuming $\mathcal{O}(1)$ cost for each CNF-nonterminal subset join, $\mathbf{V}'_1\otimes \mathbf{V}'_2$. +% \begin{center} +% \begin{tabular}{ c c c c c } +% \scalebox{0.32}{\mkTrellisAppend{7}} & & \scalebox{0.32}{\mkTrellisInsert{6}} & & \scalebox{0.32}{\mkTrellisInsert{7}} \\ +% Append & & Delete & & Insert \\ +% $\mathcal{O}(n+1)$ & & $\mathcal{O}\left(\frac{1}{4}(n-1)^2\right)$ & & $\mathcal{O}\left(\frac{1}{4}(n+1)^2\right)$ \\ +% \end{tabular} +% \end{center} +% Related to \textbf{dynamic matrix inverse} and \textbf{incremental transitive closure} with vertex updates. With a careful encoding, we can incrementally update SAT constraints as new keystrokes are received to eliminate redundancy. +%\end{frame} +% +%%\begin{frame}{Error Recovery} +%% \begin{figure}[H] +%% \adjustbox{scale=0.75,center}{% +%% \hspace{-0.5cm}\begin{minipage}[l]{6cm} +%% \[ +%% \begin{NiceMatrix} +%% \leftarrow & \nse & \nsi & \nfi & \nfo & \nth & \ntw & \non & \leftarrow & \ppp \\ +%% & & \ddd & \ddd & \ddd & \ddd & \ddd & \ddd & \ddd & \ppp \\ +%% \sigma_1^\shri & \cdd & & A & & & & & & \ppp \\ +%% \vno & \ddd & T_A & \vdd & & & & & & \ppp \\ +%% \vdd & \ddd & & \pcd & \cdd & & B & & & \ppp \\ +%% & & & & & T_B & \vdd & & & \ppp \\ +%% & & & & & & \pcd & \cdd & & C \\ +%% & & & & & & & & T_C & \vdd \\ +%% & & & & & & & \text{\emoji{cross-mark}} & & \\ +%% & & & & & & & & & \\ +%% & & & & & & & & & \\ +%% & & & & & & & & \ppp & \sigma_n^\shup \\ +%% \vno & \cdd & & & & & & & \vno & +%% \end{NiceMatrix} +%% \] +%% \end{minipage} +%% } +%% \hspace{1cm} +%% +%% \caption{By recursing over upper diagonals of decreasing elevation and discarding subtrees that fall under the shadow of another's canopy, we can recover parseable subtrees.} +%% \end{figure} +%%\end{frame} +% +%\begin{frame}[fragile]{Conjunctive parsing} +% It is well-known that the family of CFLs is not closed under intersection. 
For example, consider $\mathcal{L}_\cap := \mathcal{L}_{\mathcal{G}_1} \cap \mathcal{L}_{\mathcal{G}_2}$: +% +% \begin{table}[H] +% \begin{tabular}{llll} +% $P_1 := \big\{\;S \rightarrow L R,$ & $L \rightarrow a b \mid a L b,$ & $R \rightarrow c \mid c R\;\big\}$\vspace{5pt} \\ +% $P_2 := \big\{\;S \rightarrow L R,$ & $R \rightarrow b c \mid b R c,$ & $L \rightarrow a \mid a L\;\big\}$ +% \end{tabular} +% \end{table} +% +% \noindent Note that $\mathcal{L}_\cap$ generates the language $\big\{\;a^d b^d c^d \mid d > 0\;\big\}$, which according to the pumping lemma is not context-free. To encode $\mathcal{L}_\cap$, we intersect all terminals $\Sigma_\cap := \bigcap_{i=1}^c \Sigma_i$, then for each $t_\cap \in \Sigma_\cap$ and CFG, construct an equivalence class $E(t_\cap, \mathcal{G}_i) = \{ w_i \mid (w_i \rightarrow t_\cap) \in P_i\}$ as follows:\vspace{-5pt} +% +% \begin{align} +% \bigwedge_{t\in\Sigma_\cap}\bigwedge_{j = 1}^{c-1}\bigwedge_{i=1}^{|\sigma|} E(t_{\cap}, \mathcal{G}_j) \equiv_{\sigma_i} E(t_{\cap}, \mathcal{G}_{j+1}) +% \end{align} +% % Generated by cfl4_intersection.vox, open with https://voxelator.com/ +% \begin{figure}[H] +% \includegraphics[height=0.093\textwidth]{../figures/angle1.png}\hspace{-5pt} +% \includegraphics[height=0.093\textwidth]{../figures/angle2.png}\hspace{-5pt} +% \includegraphics[height=0.093\textwidth]{../figures/angle5.png}\hspace{-5pt} +% \includegraphics[height=0.093\textwidth]{../figures/angle3.png}\hspace{-3pt} +% \includegraphics[height=0.093\textwidth]{../figures/angle4.png} +% \end{figure} +%\end{frame} +% +%\begin{frame}[fragile]{An Simple Reachability Proof} +% \begin{lemma} +% For any nonempty language $\ell: \mathcal{L}(\mathcal{G})$ and invalid string $\err{\sigma}: \Sigma^n$, there exists an $(\tilde{\sigma}, m)$ such that $\tilde{\sigma} \in \ell\cap\Sigma^m$ and $0 < \Delta(\err{\sigma}, \ell) \leq \max(m, n) < \infty$, where $\Delta$ denotes the Levenshtein edit distance.\\ +% \end{lemma} +% +% \begin{proof} +% Since $\ell$ is nonempty, it must have at least one inhabitant $\sigma \in \ell$. Let $\tilde{\sigma}$ be the smallest such member. Since $\tilde{\sigma}$ is a valid sentence in $\ell$, by definition it must be that $|\tilde{\sigma}|<\infty$. Let $m:=|\tilde{\sigma}|$. Since we know $\err{\sigma} \notin \ell$, it follows that $0 < \Delta(\err{\sigma}, \ell)$. Let us consider two cases, either $\tilde{\sigma} = \varepsilon$, or $0 < |\tilde{\sigma}|$: +% +% \begin{itemize} +% \item If $\tilde{\sigma} = \varepsilon$, then $\Delta(\err{\sigma}, \tilde{\sigma}) = n$ by full erasure of $\err{\sigma}$, or +% \item If $0 < m$, then $\Delta(\err{\sigma}, \tilde{\sigma}) \leq \max(m, n)$ by overwriting. +% \end{itemize} +% +% In either case, it follows $\Delta(\err{\sigma}, \ell) \leq \max(m, n)$ and $\ell$ is always reachable via a finite nonempty set of Levenshtein edits, i.e., $0 < \Delta(\err{\sigma}, \ell) < \infty$. +% \end{proof} +%\end{frame} +% +%\begin{frame}[fragile]{Probabilistic repair generation} +% \resizebox{\textwidth}{!}{ +% \begin{minipage}{1.4\textwidth} +% \input{prob_reach.tex} +% \end{minipage} +% } +%\end{frame} +% +%\begin{frame}[fragile]{Multicore Scaling Results (aarch64)} +% \begin{tikzpicture} +% \begin{axis}[ +% ybar, +% enlargelimits=0.15, +% legend style={at={(0.03,0.97)},anchor=north west}, +% title={\textbf{Relative Total Distinct Solutions Found vs. 
Single Core}}, +% x=1cm, +% ylabel={Relative improvement}, +% xlabel={Number of assigned cores}, +% symbolic x coords={2,3,4,5,6,7,8,9,10}, +% xtick=data, +% bar width=3pt, +% ] +% \addplot coordinates { +% (2,0.1343) (3,0.1955) (4,0.2249) (5,0.2475) +% (6,0.2760) (7,0.2994) (8,0.3073) (9,0.3151) (10,0.3229) +% }; +% \addplot coordinates { +% (2,0.2655) (3,0.4353) (4,0.5614) (5,0.6644) +% (6,0.7462) (7,0.7783) (8,0.8347) (9,0.8005) (10,0.7798) +% }; +% \addplot coordinates { +% (2,0.3972) (3,0.6928) (4,0.9327) (5,1.1834) +% (6,1.3138) (7,1.3988) (8,1.6039) (9,1.5500) (10,1.5691) +% }; +% \addplot coordinates { +% (2,0.4863) (3,0.8326) (4,1.1368) (5,1.3879) +% (6,1.5873) (7,1.7494) (8,1.8802) (9,1.9059) (10,1.9625) +% }; +% \addplot coordinates { +% (2,0.4315) (3,0.7583) (4,1.0122) (5,1.2593) +% (6,1.4586) (7,1.6349) (8,1.7813) (9,1.8324) (10,1.8695) +% }; +% \legend{Holes=2,Holes=3,Holes=4,Holes=5,Holes=6} +% \end{axis} +% \end{tikzpicture} +%\end{frame} \end{document} \ No newline at end of file