From b11b55034fdf6c1ca69991577030893303faf8a1 Mon Sep 17 00:00:00 2001
From: newcomb-luke
Date: Fri, 25 Feb 2022 09:24:52 -0500
Subject: [PATCH 1/2] Created translation phase 2 code

---
 src/lexer/mod.rs           | 89 +++++++++++++++++++++++++++++++++-----
 src/lexer/token.rs         | 10 ++++-
 src/lib.rs                 |  1 +
 src/main.rs                |  9 +++-
 src/preprocessor/mod.rs    |  1 +
 src/preprocessor/phase2.rs | 75 ++++++++++++++++++++++++++++++++
 test.c                     |  9 +++-
 7 files changed, 178 insertions(+), 16 deletions(-)
 create mode 100644 src/preprocessor/phase2.rs

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index a09e142..63f8901 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -11,6 +11,8 @@ pub type LexResult = Result<Vec<PToken>, ()>;
 
 /// Runs the Lexer that takes the input source string and produces a Vec<PToken> for later preprocessing
 pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
+    // TODO: Emit warning for attempted nested multi-line comments
+
     let mut tokens = Vec::new();
 
     let source = input_file.src.as_ref().unwrap();
@@ -24,6 +26,11 @@ pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
     // we had an error or not after lexing is complete
     let mut had_error = false;
 
+    // Tracks whether we are currently inside a multi-line comment. Such comments are
+    // stripped at this stage because removing them later would be a burden, and they
+    // have no effect on the actual code
+    let mut multi_comment_start: Option<PToken> = None;
+
     while let Some(kind) = lexer.next() {
         // Gets the slice of the source code that the current token is from
         let slice = lexer.slice();
@@ -36,20 +43,55 @@ pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
             end: index + slice.len(),
         };
 
-        if token.kind == PTokenKind::ErrorGeneric {
-            let text = session.span_to_string(&token.into()).unwrap();
-
-            session
-                .struct_error(format!("error lexing token `{}`", text))
-                .span_label(token.into(), "invalid token found")
-                .emit();
-
-            had_error = true;
+        if token.kind == PTokenKind::CommentMultiStart {
+            if let Some(comment_start) = multi_comment_start {
+                session
+                    .struct_span_warn(comment_start.into(), "`/*` within block comment")
+                    .note("block comments cannot be nested")
+                    .emit();
+            } else {
+                multi_comment_start = Some(token);
+            }
+        } else if token.kind == PTokenKind::CommentMultiEnd {
+            if multi_comment_start.is_some() {
+                multi_comment_start = None;
+            } else {
+                session
+                    .struct_error("unexpected token `*/`")
+                    .span_label(token.into(), "lone block comment terminator")
+                    .emit();
+
+                had_error = true;
+            }
+        } else {
+            if multi_comment_start.is_none() {
+                if token.kind == PTokenKind::ErrorGeneric {
+                    let text = session.span_to_string(&token.into()).unwrap();
+
+                    session
+                        .struct_error(format!("error lexing token `{}`", text))
+                        .span_label(token.into(), "invalid token found")
+                        .emit();
+
+                    had_error = true;
+                }
+
+                tokens.push(token);
+            }
         }
 
         index += slice.len();
+    }
 
-        tokens.push(token);
+    if let Some(comment_start) = multi_comment_start {
+        session
+            .struct_span_error(
+                comment_start.into(),
+                "unterminated block comment begins here",
+            )
+            .emit();
+
+        had_error = true;
     }
 
     if !had_error {
@@ -212,7 +254,7 @@ mod tests {
     #[test]
     fn lex_punctuators() {
         let (sess, src) = dummy_sess(
-            r#"( ) , [ ] { } . -> ++ -- & * + - ~ ! / % << >> < > <= >= == != ^ | && || ? : ; ... = *= /= %= += -= <<= >>= &= ^= |= # ## <: :> <% %> %: %:%:"#,
+            r#"( ) , [ ] { } . -> ++ -- & * + - ~ ! / % << >> < > <= >= == != ^ | && || ? : ; ... = *= /= %= += -= <<= >>= &= ^= |= # ## <: :> <% %> %: %:%: \"#,
         );
 
         let input = super::lex(&sess, src.clone()).unwrap();
@@ -272,6 +314,31 @@ mod tests {
             (PTokenKind::Punctuator, "%>"),
             (PTokenKind::Punctuator, "%:"),
             (PTokenKind::Punctuator, "%:%:"),
+            (PTokenKind::Backslash, "\\"),
+        ];
+
+        check_matches(src, input, reference);
+    }
+
+    #[test]
+    fn lex_comments() {
+        let (sess, src) = dummy_sess(
+            r#"// This is a single line comment
+/*
+ * This is a multi-line comment
+ */"#,
+        );
+
+        let input = super::lex(&sess, src.clone()).unwrap();
+
+        // NOTE: Multi-line comments are stripped during lexing, and therefore should not show up
+        // here
+        let reference = vec![
+            (
+                PTokenKind::CommentSingle,
+                r#"// This is a single line comment"#,
+            ),
+            (PTokenKind::Newline, "\n"),
         ];
 
         check_matches(src, input, reference);
diff --git a/src/lexer/token.rs b/src/lexer/token.rs
index 972071b..d35202b 100644
--- a/src/lexer/token.rs
+++ b/src/lexer/token.rs
@@ -66,8 +66,14 @@ pub enum PTokenKind {
     CommentSingle,
 
     /// A multi-line comment
-    #[regex(r"/\*[^*]*\*+(?:[^/*][^*]*\*+)*/")]
-    CommentMulti,
+    #[regex(r"/\*")]
+    CommentMultiStart,
+
+    #[regex(r"\*/")]
+    CommentMultiEnd,
+
+    #[token("\\")]
+    Backslash,
 
     /// Any non-newline whitespace, which we can't skip for the single reason that: preprocessor
     /// operations
diff --git a/src/lib.rs b/src/lib.rs
index 7bc3307..e79d6eb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,7 @@
 #![allow(clippy::result_unit_err)]
 pub mod diagnostic;
 pub mod lexer;
+pub mod preprocessor;
 
 #[cfg(test)]
 mod tests {
diff --git a/src/main.rs b/src/main.rs
index d5b5be5..ef4a0ef 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,7 @@
 use sacc::{
     diagnostic::{session::Session, Handler, HandlerFlags, SourceManager},
     lexer::lex,
+    preprocessor::phase2::phase2,
 };
 use std::{path::Path, process::exit, rc::Rc};
 
@@ -21,9 +22,13 @@ fn main() {
 
     match session.load_file(path) {
         Ok(root_src) => {
+            // Lex tokens from our main source
            if let Ok(tokens) = lex(&session, root_src) {
-                for token in tokens.iter() {
-                    println!("{:?}", token);
+                // Run phase 2 of translation, which removes comments and backslash-newline pairs
+                if let Ok(tokens) = phase2(tokens, &session) {
+                    for token in tokens.iter() {
+                        println!("{:?}", token);
+                    }
                 }
             }
         }
diff --git a/src/preprocessor/mod.rs b/src/preprocessor/mod.rs
index e69de29..d7c6dc6 100644
--- a/src/preprocessor/mod.rs
+++ b/src/preprocessor/mod.rs
@@ -0,0 +1 @@
+pub mod phase2;
diff --git a/src/preprocessor/phase2.rs b/src/preprocessor/phase2.rs
new file mode 100644
index 0000000..ad18a32
--- /dev/null
+++ b/src/preprocessor/phase2.rs
@@ -0,0 +1,75 @@
+use crate::{
+    diagnostic::session::Session,
+    lexer::{PToken, PTokenKind},
+};
+
+/// Phase 1 according to the C specification is replacing trigraph sequences. Because of the
+/// nature of preprocessing tokens, and a distaste for looping through every character before it
+/// reaches the lexer, that phase is postponed, as it validly can be. Therefore phase 2 comes
+/// first.
+///
+/// According to the C specification, phase 2 consists of:
+///
+/// Each instance of a backslash character (\) immediately followed by a new-line
+/// character is deleted, splicing physical source lines to form logical source lines.
+/// Only the last backslash on any physical source line shall be eligible for being part
+/// of such a splice. A source file that is not empty shall end in a new-line character,
+/// which shall not be immediately preceded by a backslash character before any such
+/// splicing takes place.
+///
+/// Therefore this function deletes each backslash that is immediately followed by a newline.
+/// Because single-line comments also have no effect on the code generated from C, they are
+/// stripped here as well.
+pub fn phase2(tokens: Vec<PToken>, session: &Session) -> Result<Vec<PToken>, ()> {
+    let mut new_tokens = Vec::with_capacity(tokens.capacity());
+
+    let mut backslash: Option<PToken> = None;
+    let mut has_error = false;
+
+    for token in tokens {
+        if backslash.is_some() {
+            if token.kind == PTokenKind::Newline {
+                backslash = None;
+            } else if token.kind == PTokenKind::Whitespace {
+                session
+                    .struct_span_warn(token.into(), "whitespace before newline after `\\`")
+                    .emit();
+            } else {
+                // At this point we don't have to worry about other files being included in the
+                // token stream
+                let s = session.span_to_string(&token.into()).unwrap();
+
+                session
+                    .struct_error(format!("found unexpected token `{}`", s))
+                    .span_label(token.into(), "expected newline after `\\`, found this")
+                    .emit();
+
+                // Keep going, so that any repeat of the same mistake is also reported
+                has_error = true;
+                backslash = None;
+            }
+        } else {
+            if token.kind != PTokenKind::CommentSingle {
+                if token.kind == PTokenKind::Backslash {
+                    backslash = Some(token);
+                } else {
+                    new_tokens.push(token);
+                }
+            }
+        }
+    }
+
+    if let Some(backslash) = backslash {
+        session
+            .struct_error("unexpected end of file")
+            .span_label(backslash.into(), "after backslash")
+            .emit();
+        has_error = true;
+    }
+
+    if has_error {
+        Err(())
+    } else {
+        Ok(new_tokens)
+    }
+}
diff --git a/test.c b/test.c
index dda5019..df2259c 100644
--- a/test.c
+++ b/test.c
@@ -3,5 +3,12 @@ int main() {
 
     printf("Hello world");
 
-    return 0;
+    // Single-line comment
+
+    /*
+     * $$$ test $$$
+     */
+
+    return \
+    0;
 }

From 23989a23d4f601d922c87f68ba3446ad147926f5 Mon Sep 17 00:00:00 2001
From: newcomb-luke
Date: Fri, 25 Feb 2022 09:27:44 -0500
Subject: [PATCH 2/2] Fixed clippy warnings on new code

---
 src/lexer/mod.rs           | 24 ++++++++++--------------
 src/preprocessor/phase2.rs | 12 +++++-------
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 63f8901..e157079 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -11,8 +11,6 @@ pub type LexResult = Result<Vec<PToken>, ()>;
 
 /// Runs the Lexer that takes the input source string and produces a Vec<PToken> for later preprocessing
 pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
-    // TODO: Emit warning for attempted nested multi-line comments
-
     let mut tokens = Vec::new();
 
     let source = input_file.src.as_ref().unwrap();
@@ -63,21 +61,19 @@ pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
 
                 had_error = true;
             }
-        } else {
-            if multi_comment_start.is_none() {
-                if token.kind == PTokenKind::ErrorGeneric {
-                    let text = session.span_to_string(&token.into()).unwrap();
+        } else if multi_comment_start.is_none() {
+            if token.kind == PTokenKind::ErrorGeneric {
+                let text = session.span_to_string(&token.into()).unwrap();
 
-                    session
-                        .struct_error(format!("error lexing token `{}`", text))
-                        .span_label(token.into(), "invalid token found")
-                        .emit();
+                session
+                    .struct_error(format!("error lexing token `{}`", text))
+                    .span_label(token.into(), "invalid token found")
+                    .emit();
 
-                    had_error = true;
-                }
+                had_error = true;
+            }
 
-                tokens.push(token);
-            }
+            tokens.push(token);
         }
 
         index += slice.len();
diff --git a/src/preprocessor/phase2.rs b/src/preprocessor/phase2.rs
index ad18a32..0b8d8f4 100644
--- a/src/preprocessor/phase2.rs
+++ b/src/preprocessor/phase2.rs
@@ -48,13 +48,11 @@ pub fn phase2(tokens: Vec<PToken>, session: &Session) -> Result<Vec<PToken>, ()
                 has_error = true;
                 backslash = None;
             }
-        } else {
-            if token.kind != PTokenKind::CommentSingle {
-                if token.kind == PTokenKind::Backslash {
-                    backslash = Some(token);
-                } else {
-                    new_tokens.push(token);
-                }
+        } else if token.kind != PTokenKind::CommentSingle {
+            if token.kind == PTokenKind::Backslash {
+                backslash = Some(token);
+            } else {
+                new_tokens.push(token);
             }
         }
     }
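
A quick sketch of how the two passes compose, for review context only (not part
of either patch): it is written against the `lex` and `phase2` signatures above,
and assumes a `Session` plus the `Rc<SourceFile>` root source obtained exactly
as in src/main.rs.

    // Sketch under the assumptions above, not committed code.
    fn front_end(session: &Session, root_src: Rc<SourceFile>) -> Result<Vec<PToken>, ()> {
        // lex() already strips /* ... */ block comments (rejecting stray `*/`
        // and unterminated `/*`), and emits Backslash and Newline tokens as-is
        let tokens = lex(session, root_src)?;
        // phase2() then deletes each Backslash followed by a Newline (line
        // splicing) and drops single-line comments, so the spliced
        //     return \
        //     0;
        // in test.c yields the same token stream as `return 0;`
        phase2(tokens, session)
    }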