From b11b55034fdf6c1ca69991577030893303faf8a1 Mon Sep 17 00:00:00 2001
From: newcomb-luke
Date: Fri, 25 Feb 2022 09:24:52 -0500
Subject: [PATCH 1/2] Created translation phase 2 code

---
 src/lexer/mod.rs           | 89 +++++++++++++++++++++++++++++++++-----
 src/lexer/token.rs         | 10 ++++-
 src/lib.rs                 |  1 +
 src/main.rs                |  9 +++-
 src/preprocessor/mod.rs    |  1 +
 src/preprocessor/phase2.rs | 75 ++++++++++++++++++++++++++++++++
 test.c                     |  9 +++-
 7 files changed, 178 insertions(+), 16 deletions(-)
 create mode 100644 src/preprocessor/phase2.rs

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index a09e142..63f8901 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -11,6 +11,8 @@ pub type LexResult = Result<Vec<PToken>, ()>;
 
 /// Runs the Lexer that takes the input source string and produces a Vec<PToken> for later preprocessing
 pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
+    // TODO: Emit warning for attempted nested multi-line comments
+
     let mut tokens = Vec::new();
 
     let source = input_file.src.as_ref().unwrap();
@@ -24,6 +26,11 @@ pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
     // we had an error or not after lexing is complete
     let mut had_error = false;
 
+    // Tracks whether we are currently inside a multi-line comment. Such comments are
+    // stripped at this stage because removing them later would be a burden, and they
+    // have no effect on the actual code
+    let mut multi_comment_start: Option<PToken> = None;
+
     while let Some(kind) = lexer.next() {
         // Gets the slice of the source code that the current token is from
         let slice = lexer.slice();
@@ -36,20 +43,55 @@ pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
             end: index + slice.len(),
         };
 
-        if token.kind == PTokenKind::ErrorGeneric {
-            let text = session.span_to_string(&token.into()).unwrap();
-
-            session
-                .struct_error(format!("error lexing token `{}`", text))
-                .span_label(token.into(), "invalid token found")
-                .emit();
-
-            had_error = true;
+        if token.kind == PTokenKind::CommentMultiStart {
+            if let Some(comment_start) = multi_comment_start {
+                session
+                    .struct_span_warn(comment_start.into(), "`/*` within block comment")
+                    .note("block comments cannot be nested")
+                    .emit();
+            } else {
+                multi_comment_start = Some(token);
+            }
+        } else if token.kind == PTokenKind::CommentMultiEnd {
+            if multi_comment_start.is_some() {
+                multi_comment_start = None;
+            } else {
+                session
+                    .struct_error("unexpected token `*/`")
+                    .span_label(token.into(), "lone block comment terminator")
+                    .emit();
+
+                had_error = true;
+            }
+        } else {
+            if multi_comment_start.is_none() {
+                if token.kind == PTokenKind::ErrorGeneric {
+                    let text = session.span_to_string(&token.into()).unwrap();
+
+                    session
+                        .struct_error(format!("error lexing token `{}`", text))
+                        .span_label(token.into(), "invalid token found")
+                        .emit();
+
+                    had_error = true;
+                }
+
+                tokens.push(token);
+            }
         }
 
         index += slice.len();
+    }
 
-        tokens.push(token);
+    if let Some(comment_start) = multi_comment_start {
+        session
+            .struct_span_error(
+                comment_start.into(),
+                "unterminated block comment begins here",
+            )
+            .emit();
+
+        had_error = true;
     }
 
     if !had_error {
@@ -212,7 +254,7 @@ mod tests {
     #[test]
     fn lex_punctuators() {
         let (sess, src) = dummy_sess(
-            r#"( ) , [ ] { } . -> ++ -- & * + - ~ ! / % << >> < > <= >= == != ^ | && || ? : ; ... = *= /= %= += -= <<= >>= &= ^= |= # ## <: :> <% %> %: %:%:"#,
+            r#"( ) , [ ] { } . -> ++ -- & * + - ~ ! / % << >> < > <= >= == != ^ | && || ? : ; ... = *= /= %= += -= <<= >>= &= ^= |= # ## <: :> <% %> %: %:%: \"#,
         );
 
         let input = super::lex(&sess, src.clone()).unwrap();
@@ -272,6 +314,31 @@ mod tests {
             (PTokenKind::Punctuator, "%>"),
             (PTokenKind::Punctuator, "%:"),
             (PTokenKind::Punctuator, "%:%:"),
+            (PTokenKind::Backslash, "\\"),
+        ];
+
+        check_matches(src, input, reference);
+    }
+
+    #[test]
+    fn lex_comments() {
+        let (sess, src) = dummy_sess(
+            r#"// This is a single line comment
+/*
+ * This is a multi-line comment
+ */"#,
+        );
+
+        let input = super::lex(&sess, src.clone()).unwrap();
+
+        // NOTE: Multi-line comments are stripped during lexing, and therefore should not show up
+        // here
+        let reference = vec![
+            (
+                PTokenKind::CommentSingle,
+                r#"// This is a single line comment"#,
+            ),
+            (PTokenKind::Newline, "\n"),
         ];
 
         check_matches(src, input, reference);
diff --git a/src/lexer/token.rs b/src/lexer/token.rs
index 972071b..d35202b 100644
--- a/src/lexer/token.rs
+++ b/src/lexer/token.rs
@@ -66,8 +66,14 @@ pub enum PTokenKind {
     CommentSingle,
 
     /// A multi-line comment
-    #[regex(r"/\*[^*]*\*+(?:[^/*][^*]*\*+)*/")]
-    CommentMulti,
+    #[regex(r"/\*")]
+    CommentMultiStart,
+
+    #[regex(r"\*/")]
+    CommentMultiEnd,
+
+    #[token("\\")]
+    Backslash,
 
     /// Any non-newline whitespace, which we can't skip for the single reason that: preprocessor
     /// operations
diff --git a/src/lib.rs b/src/lib.rs
index 7bc3307..e79d6eb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,7 @@
 #![allow(clippy::result_unit_err)]
 pub mod diagnostic;
 pub mod lexer;
+pub mod preprocessor;
 
 #[cfg(test)]
 mod tests {
diff --git a/src/main.rs b/src/main.rs
index d5b5be5..ef4a0ef 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,7 @@
 use sacc::{
     diagnostic::{session::Session, Handler, HandlerFlags, SourceManager},
     lexer::lex,
+    preprocessor::phase2::phase2,
 };
 use std::{path::Path, process::exit, rc::Rc};
 
@@ -21,9 +22,13 @@ fn main() {
 
     match session.load_file(path) {
         Ok(root_src) => {
+            // Lex tokens from our main source
            if let Ok(tokens) = lex(&session, root_src) {
-                for token in tokens.iter() {
-                    println!("{:?}", token);
+                // Run phase 2 of translation, which removes comments and backslash-newline pairs
+                if let Ok(tokens) = phase2(tokens, &session) {
+                    for token in tokens.iter() {
+                        println!("{:?}", token);
+                    }
                 }
             }
         }
diff --git a/src/preprocessor/mod.rs b/src/preprocessor/mod.rs
index e69de29..d7c6dc6 100644
--- a/src/preprocessor/mod.rs
+++ b/src/preprocessor/mod.rs
@@ -0,0 +1 @@
+pub mod phase2;
diff --git a/src/preprocessor/phase2.rs b/src/preprocessor/phase2.rs
new file mode 100644
index 0000000..ad18a32
--- /dev/null
+++ b/src/preprocessor/phase2.rs
@@ -0,0 +1,75 @@
+use crate::{
+    diagnostic::session::Session,
+    lexer::{PToken, PTokenKind},
+};
+
+/// Phase 1 according to the C specification is replacing trigraph sequences. Because of the
+/// nature of preprocessing tokens, and a distaste for looping through every character before it
+/// reaches the lexer, that phase is postponed, as it validly can be. Therefore phase 2 comes
+/// first.
+///
+/// According to the C specification, phase 2 consists of:
+///
+/// Each instance of a backslash character (\) immediately followed by a new-line
+/// character is deleted, splicing physical source lines to form logical source lines.
+/// Only the last backslash on any physical source line shall be eligible for being part
+/// of such a splice. A source file that is not empty shall end in a new-line character,
+/// which shall not be immediately preceded by a backslash character before any such
+/// splicing takes place.
+///
+/// Therefore this function deletes each backslash that is immediately followed by a newline.
+/// Because single-line comments also have no effect on the code generated from C, they are
+/// stripped here as well.
+pub fn phase2(tokens: Vec<PToken>, session: &Session) -> Result<Vec<PToken>, ()> {
+    let mut new_tokens = Vec::with_capacity(tokens.capacity());
+
+    let mut backslash: Option<PToken> = None;
+    let mut has_error = false;
+
+    for token in tokens {
+        if backslash.is_some() {
+            if token.kind == PTokenKind::Newline {
+                backslash = None;
+            } else if token.kind == PTokenKind::Whitespace {
+                session
+                    .struct_span_warn(token.into(), "whitespace before newline after `\\`")
+                    .emit();
+            } else {
+                // At this point we don't have to worry about other files being included in the
+                // token stream
+                let s = session.span_to_string(&token.into()).unwrap();
+
+                session
+                    .struct_error(format!("found unexpected token `{}`", s))
+                    .span_label(token.into(), "expected newline after `\\`, found this")
+                    .emit();
+
+                // Keep going, so that any repeat of the same mistake is also reported
+                has_error = true;
+                backslash = None;
+            }
+        } else {
+            if token.kind != PTokenKind::CommentSingle {
+                if token.kind == PTokenKind::Backslash {
+                    backslash = Some(token);
+                } else {
+                    new_tokens.push(token);
+                }
+            }
+        }
+    }
+
+    if let Some(backslash) = backslash {
+        session
+            .struct_error("unexpected end of file")
+            .span_label(backslash.into(), "after backslash")
+            .emit();
+        has_error = true;
+    }
+
+    if has_error {
+        Err(())
+    } else {
+        Ok(new_tokens)
+    }
+}
diff --git a/test.c b/test.c
index dda5019..df2259c 100644
--- a/test.c
+++ b/test.c
@@ -3,5 +3,12 @@ int main() {
 
     printf("Hello world");
 
-    return 0;
+    // Single-line comment
+
+    /*
+     * $$$ test $$$
+     */
+
+    return \
+    0;
 }

From 23989a23d4f601d922c87f68ba3446ad147926f5 Mon Sep 17 00:00:00 2001
From: newcomb-luke
Date: Fri, 25 Feb 2022 09:27:44 -0500
Subject: [PATCH 2/2] Fixed clippy warnings on new code

---
 src/lexer/mod.rs           | 24 ++++++++++--------------
 src/preprocessor/phase2.rs | 12 +++++-------
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 63f8901..e157079 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -11,8 +11,6 @@ pub type LexResult = Result<Vec<PToken>, ()>;
 
 /// Runs the Lexer that takes the input source string and produces a Vec<PToken> for later preprocessing
 pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
-    // TODO: Emit warning for attempted nested multi-line comments
-
     let mut tokens = Vec::new();
 
     let source = input_file.src.as_ref().unwrap();
@@ -63,21 +61,19 @@ pub fn lex(session: &Session, input_file: Rc<SourceFile>) -> LexResult {
 
                 had_error = true;
             }
-        } else {
-            if multi_comment_start.is_none() {
-                if token.kind == PTokenKind::ErrorGeneric {
-                    let text = session.span_to_string(&token.into()).unwrap();
+        } else if multi_comment_start.is_none() {
+            if token.kind == PTokenKind::ErrorGeneric {
+                let text = session.span_to_string(&token.into()).unwrap();
 
-                    session
-                        .struct_error(format!("error lexing token `{}`", text))
-                        .span_label(token.into(), "invalid token found")
-                        .emit();
+                session
+                    .struct_error(format!("error lexing token `{}`", text))
+                    .span_label(token.into(), "invalid token found")
+                    .emit();
 
-                    had_error = true;
-                }
+                had_error = true;
+            }
 
-                tokens.push(token);
-            }
+            tokens.push(token);
         }
 
         index += slice.len();
diff --git a/src/preprocessor/phase2.rs b/src/preprocessor/phase2.rs
index ad18a32..0b8d8f4 100644
--- a/src/preprocessor/phase2.rs
+++ b/src/preprocessor/phase2.rs
@@ -48,13 +48,11 @@ pub fn phase2(tokens: Vec<PToken>, session: &Session) -> Result<Vec<PToken>, ()
                 has_error = true;
                 backslash = None;
             }
-        } else {
-            if token.kind != PTokenKind::CommentSingle {
-                if token.kind == PTokenKind::Backslash {
-                    backslash = Some(token);
-                } else {
-                    new_tokens.push(token);
-                }
+        } else if token.kind != PTokenKind::CommentSingle {
+            if token.kind == PTokenKind::Backslash {
+                backslash = Some(token);
+            } else {
+                new_tokens.push(token);
             }
         }
     }
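
A quick sketch of how the two passes compose, for review context only (not part
of either patch): it is written against the `lex` and `phase2` signatures above,
and assumes a `Session` plus the `Rc<SourceFile>` root source obtained exactly
as in src/main.rs.

    // Sketch under the assumptions above, not committed code.
    fn front_end(session: &Session, root_src: Rc<SourceFile>) -> Result<Vec<PToken>, ()> {
        // lex() already strips /* ... */ block comments (rejecting stray `*/`
        // and unterminated `/*`), and emits Backslash and Newline tokens as-is
        let tokens = lex(session, root_src)?;
        // phase2() then deletes each Backslash followed by a Newline (line
        // splicing) and drops single-line comments, so the spliced
        //     return \
        //     0;
        // in test.c yields the same token stream as `return 0;`
        phase2(tokens, session)
    }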