From 79e281007d7c1f794112edffde8c412bdcfe7318 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Fri, 4 Oct 2024 23:05:00 +0200 Subject: [PATCH 01/13] fix: save --- crates/pg_lexer/src/lib.rs | 6 +- crates/pg_statement_splitter/src/data.rs | 14 + .../src/is_at_stmt_start.rs | 1015 ----------------- crates/pg_statement_splitter/src/lib.rs | 138 +-- crates/pg_statement_splitter/src/parser.rs | 149 ++- crates/pg_statement_splitter/src/split.rs | 148 +++ 6 files changed, 261 insertions(+), 1209 deletions(-) create mode 100644 crates/pg_statement_splitter/src/data.rs delete mode 100644 crates/pg_statement_splitter/src/is_at_stmt_start.rs create mode 100644 crates/pg_statement_splitter/src/split.rs diff --git a/crates/pg_lexer/src/lib.rs b/crates/pg_lexer/src/lib.rs index ece57fb3c..df24f8d85 100644 --- a/crates/pg_lexer/src/lib.rs +++ b/crates/pg_lexer/src/lib.rs @@ -65,7 +65,7 @@ static PATTERN_LEXER: LazyLock = fn whitespace_tokens(input: &str) -> VecDeque { let mut tokens = VecDeque::new(); - for cap in PATTERN_LEXER.captures_iter(&input) { + for cap in PATTERN_LEXER.captures_iter(input) { if let Some(whitespace) = cap.name("whitespace") { tokens.push_back(Token { token_type: TokenType::Whitespace, @@ -139,8 +139,8 @@ pub fn lex(text: &str) -> Vec { kind: SyntaxKind::from(&pg_query_token), text: token_text, span: TextRange::new( - TextSize::try_from(u32::try_from(pg_query_token.start).unwrap()).unwrap(), - TextSize::try_from(u32::try_from(pg_query_token.end).unwrap()).unwrap(), + TextSize::from(u32::try_from(pg_query_token.start).unwrap()), + TextSize::from(u32::try_from(pg_query_token.end).unwrap()), ), }); pos += len; diff --git a/crates/pg_statement_splitter/src/data.rs b/crates/pg_statement_splitter/src/data.rs new file mode 100644 index 000000000..87bd86734 --- /dev/null +++ b/crates/pg_statement_splitter/src/data.rs @@ -0,0 +1,14 @@ +use pg_lexer::SyntaxKind; + +pub static STATEMENT_START_TOKENS: &[SyntaxKind] = &[ + SyntaxKind::With, + SyntaxKind::Select, + SyntaxKind::Insert, + SyntaxKind::Update, + SyntaxKind::DeleteP, + SyntaxKind::Create, +]; + +pub(crate) fn at_statement_start(kind: SyntaxKind) -> bool { + STATEMENT_START_TOKENS.contains(&kind) +} diff --git a/crates/pg_statement_splitter/src/is_at_stmt_start.rs b/crates/pg_statement_splitter/src/is_at_stmt_start.rs deleted file mode 100644 index ec1b83ea1..000000000 --- a/crates/pg_statement_splitter/src/is_at_stmt_start.rs +++ /dev/null @@ -1,1015 +0,0 @@ -use std::collections::HashMap; -use std::sync::LazyLock; - -use super::Parser; -use pg_lexer::SyntaxKind; - -pub enum SyntaxToken { - Required(SyntaxKind), - Optional(SyntaxKind), -} - -#[derive(Debug, Clone, Hash)] -pub enum TokenStatement { - // The respective token is the last token of the statement - EoS(SyntaxKind), - Any(SyntaxKind), -} - -impl TokenStatement { - fn is_eos(&self) -> bool { - match self { - TokenStatement::EoS(_) => true, - _ => false, - } - } - - fn kind(&self) -> SyntaxKind { - match self { - TokenStatement::EoS(k) => k.to_owned(), - TokenStatement::Any(k) => k.to_owned(), - } - } -} - -impl PartialEq for TokenStatement { - fn eq(&self, other: &Self) -> bool { - let a = match self { - TokenStatement::EoS(s) => s, - TokenStatement::Any(s) => s, - }; - - let b = match other { - TokenStatement::EoS(s) => s, - TokenStatement::Any(s) => s, - }; - - return a == b; - } -} - -// vector of hashmaps, where each hashmap returns the list of possible statements for a token at -// the respective index. -// -// For example, at idx 0, the hashmap contains a superset of -// ``` -//{ -// Create: [ -// IndexStmt, -// CreateFunctionStmt, -// CreateStmt, -// ViewStmt, -// ], -// Select: [ -// SelectStmt, -// ], -// }, -// ``` -// -// the idea is to trim down the possible options for each token, until only one statement is left. -// -// The vector is lazily constructed out of another vector of tuples, where each tuple contains a -// statement, and a list of `SyntaxToken`s that are to be found at the start of the statement. -pub static STATEMENT_START_TOKEN_MAPS: LazyLock>>> = - LazyLock::new(|| { - let mut m: Vec<(SyntaxKind, &'static [SyntaxToken])> = Vec::new(); - - m.push(( - SyntaxKind::InsertStmt, - &[ - SyntaxToken::Required(SyntaxKind::Insert), - SyntaxToken::Required(SyntaxKind::Into), - ], - )); - - m.push(( - SyntaxKind::DeleteStmt, - &[ - SyntaxToken::Required(SyntaxKind::DeleteP), - SyntaxToken::Required(SyntaxKind::From), - ], - )); - - m.push(( - SyntaxKind::UpdateStmt, - &[SyntaxToken::Required(SyntaxKind::Update)], - )); - - m.push(( - SyntaxKind::MergeStmt, - &[ - SyntaxToken::Required(SyntaxKind::Merge), - SyntaxToken::Required(SyntaxKind::Into), - ], - )); - - m.push(( - SyntaxKind::SelectStmt, - &[SyntaxToken::Required(SyntaxKind::Select)], - )); - - m.push(( - SyntaxKind::AlterTableStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Table), - SyntaxToken::Optional(SyntaxKind::IfP), - SyntaxToken::Optional(SyntaxKind::Exists), - SyntaxToken::Optional(SyntaxKind::Only), - SyntaxToken::Required(SyntaxKind::Ident), - ], - )); - - // ALTER TABLE x RENAME ... is different to e.g. alter table alter column... - m.push(( - SyntaxKind::RenameStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Table), - SyntaxToken::Optional(SyntaxKind::IfP), - SyntaxToken::Optional(SyntaxKind::Exists), - SyntaxToken::Optional(SyntaxKind::Only), - SyntaxToken::Required(SyntaxKind::Ident), - SyntaxToken::Required(SyntaxKind::Rename), - ], - )); - - m.push(( - SyntaxKind::AlterDomainStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::DomainP), - ], - )); - - m.push(( - SyntaxKind::AlterDefaultPrivilegesStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Default), - SyntaxToken::Required(SyntaxKind::Privileges), - ], - )); - - m.push(( - SyntaxKind::ClusterStmt, - &[SyntaxToken::Required(SyntaxKind::Cluster)], - )); - - m.push(( - SyntaxKind::CopyStmt, - &[SyntaxToken::Required(SyntaxKind::Copy)], - )); - - // CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE - // this is overly simplified, but it should be good enough for now - m.push(( - SyntaxKind::CreateStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Optional(SyntaxKind::Global), - SyntaxToken::Optional(SyntaxKind::Local), - SyntaxToken::Optional(SyntaxKind::Temporary), - SyntaxToken::Optional(SyntaxKind::Temp), - SyntaxToken::Optional(SyntaxKind::Unlogged), - SyntaxToken::Optional(SyntaxKind::IfP), - SyntaxToken::Optional(SyntaxKind::Not), - SyntaxToken::Optional(SyntaxKind::Exists), - SyntaxToken::Required(SyntaxKind::Table), - SyntaxToken::Required(SyntaxKind::Ident), - ], - )); - - // CREATE [ OR REPLACE ] AGGREGATE - m.push(( - SyntaxKind::DefineStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Optional(SyntaxKind::Or), - SyntaxToken::Optional(SyntaxKind::Replace), - SyntaxToken::Required(SyntaxKind::Aggregate), - ], - )); - - // CREATE OPERATOR - m.push(( - SyntaxKind::DefineStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Operator), - ], - )); - - // CREATE TYPE name - m.push(( - SyntaxKind::DefineStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::TypeP), - SyntaxToken::Required(SyntaxKind::Ident), - ], - )); - - // CREATE TYPE name AS - m.push(( - SyntaxKind::CompositeTypeStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::TypeP), - SyntaxToken::Required(SyntaxKind::Ident), - SyntaxToken::Required(SyntaxKind::As), - ], - )); - - // CREATE TYPE name AS ENUM - m.push(( - SyntaxKind::CreateEnumStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::TypeP), - SyntaxToken::Required(SyntaxKind::Ident), - SyntaxToken::Required(SyntaxKind::As), - SyntaxToken::Required(SyntaxKind::EnumP), - ], - )); - - // CREATE TYPE name AS RANGE - m.push(( - SyntaxKind::CreateRangeStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::TypeP), - SyntaxToken::Required(SyntaxKind::Ident), - SyntaxToken::Required(SyntaxKind::As), - SyntaxToken::Required(SyntaxKind::Range), - ], - )); - - // m.push(( - // SyntaxKind::DropStmt, - // &[ - // SyntaxToken::Required(SyntaxKind::Drop), - // ], - // )); - - m.push(( - SyntaxKind::TruncateStmt, - &[SyntaxToken::Required(SyntaxKind::Truncate)], - )); - - m.push(( - SyntaxKind::CommentStmt, - &[ - SyntaxToken::Required(SyntaxKind::Comment), - SyntaxToken::Required(SyntaxKind::On), - ], - )); - - m.push(( - SyntaxKind::FetchStmt, - &[SyntaxToken::Required(SyntaxKind::Fetch)], - )); - - // CREATE [ UNIQUE ] INDEX - m.push(( - SyntaxKind::IndexStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Optional(SyntaxKind::Unique), - SyntaxToken::Required(SyntaxKind::Index), - ], - )); - - // CREATE [ OR REPLACE ] FUNCTION - m.push(( - SyntaxKind::CreateFunctionStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Optional(SyntaxKind::Or), - SyntaxToken::Optional(SyntaxKind::Replace), - SyntaxToken::Required(SyntaxKind::Function), - ], - )); - - m.push(( - SyntaxKind::AlterFunctionStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Function), - ], - )); - - m.push((SyntaxKind::DoStmt, &[SyntaxToken::Required(SyntaxKind::Do)])); - - // CREATE [ OR REPLACE ] RULE - m.push(( - SyntaxKind::RuleStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Optional(SyntaxKind::Or), - SyntaxToken::Optional(SyntaxKind::Replace), - SyntaxToken::Required(SyntaxKind::Rule), - ], - )); - - m.push(( - SyntaxKind::NotifyStmt, - &[SyntaxToken::Required(SyntaxKind::Notify)], - )); - m.push(( - SyntaxKind::ListenStmt, - &[SyntaxToken::Required(SyntaxKind::Listen)], - )); - m.push(( - SyntaxKind::UnlistenStmt, - &[SyntaxToken::Required(SyntaxKind::Unlisten)], - )); - - // TransactionStmt can be Begin or Commit - m.push(( - SyntaxKind::TransactionStmt, - &[SyntaxToken::Required(SyntaxKind::BeginP)], - )); - m.push(( - SyntaxKind::TransactionStmt, - &[SyntaxToken::Required(SyntaxKind::Commit)], - )); - - // CREATE [ OR REPLACE ] [ TEMP | TEMPORARY ] [ RECURSIVE ] VIEW - // this is overly simplified, but it should be good enough for now - m.push(( - SyntaxKind::ViewStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Optional(SyntaxKind::Or), - SyntaxToken::Optional(SyntaxKind::Replace), - SyntaxToken::Optional(SyntaxKind::Temporary), - SyntaxToken::Optional(SyntaxKind::Temp), - SyntaxToken::Optional(SyntaxKind::Recursive), - SyntaxToken::Required(SyntaxKind::View), - ], - )); - - m.push(( - SyntaxKind::LoadStmt, - &[SyntaxToken::Required(SyntaxKind::Load)], - )); - - m.push(( - SyntaxKind::CreateDomainStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::DomainP), - ], - )); - - m.push(( - SyntaxKind::CreatedbStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Database), - ], - )); - - m.push(( - SyntaxKind::DropdbStmt, - &[ - SyntaxToken::Required(SyntaxKind::Drop), - SyntaxToken::Required(SyntaxKind::Database), - ], - )); - - m.push(( - SyntaxKind::VacuumStmt, - &[SyntaxToken::Required(SyntaxKind::Vacuum)], - )); - - m.push(( - SyntaxKind::ExplainStmt, - &[SyntaxToken::Required(SyntaxKind::Explain)], - )); - - // CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } ] TABLE AS - // this is overly simplified, but it should be good enough for now - m.push(( - SyntaxKind::CreateTableAsStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Optional(SyntaxKind::Global), - SyntaxToken::Optional(SyntaxKind::Local), - SyntaxToken::Optional(SyntaxKind::Temporary), - SyntaxToken::Optional(SyntaxKind::Temp), - SyntaxToken::Required(SyntaxKind::Table), - SyntaxToken::Required(SyntaxKind::Ident), - SyntaxToken::Required(SyntaxKind::As), - ], - )); - - m.push(( - SyntaxKind::CreateSeqStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Optional(SyntaxKind::Temporary), - SyntaxToken::Optional(SyntaxKind::Temp), - SyntaxToken::Optional(SyntaxKind::Unlogged), - SyntaxToken::Required(SyntaxKind::Sequence), - ], - )); - - m.push(( - SyntaxKind::AlterSeqStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Sequence), - ], - )); - - m.push(( - SyntaxKind::VariableSetStmt, - &[SyntaxToken::Required(SyntaxKind::Set)], - )); - - m.push(( - SyntaxKind::VariableShowStmt, - &[SyntaxToken::Required(SyntaxKind::Show)], - )); - - m.push(( - SyntaxKind::DiscardStmt, - &[SyntaxToken::Required(SyntaxKind::Discard)], - )); - - // CREATE [ OR REPLACE ] [ CONSTRAINT ] TRIGGER - m.push(( - SyntaxKind::CreateTrigStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Optional(SyntaxKind::Or), - SyntaxToken::Optional(SyntaxKind::Replace), - SyntaxToken::Optional(SyntaxKind::Constraint), - SyntaxToken::Required(SyntaxKind::Trigger), - ], - )); - - m.push(( - SyntaxKind::CreateRoleStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Role), - ], - )); - - m.push(( - SyntaxKind::AlterRoleStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Role), - ], - )); - - m.push(( - SyntaxKind::DropRoleStmt, - &[ - SyntaxToken::Required(SyntaxKind::Drop), - SyntaxToken::Required(SyntaxKind::Role), - ], - )); - - m.push(( - SyntaxKind::LockStmt, - &[SyntaxToken::Required(SyntaxKind::LockP)], - )); - - m.push(( - SyntaxKind::ConstraintsSetStmt, - &[ - SyntaxToken::Required(SyntaxKind::Set), - SyntaxToken::Required(SyntaxKind::Constraints), - ], - )); - - m.push(( - SyntaxKind::ReindexStmt, - &[SyntaxToken::Required(SyntaxKind::Reindex)], - )); - - m.push(( - SyntaxKind::CheckPointStmt, - &[SyntaxToken::Required(SyntaxKind::Checkpoint)], - )); - - m.push(( - SyntaxKind::CreateSchemaStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Schema), - ], - )); - - m.push(( - SyntaxKind::AlterDatabaseStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Database), - SyntaxToken::Required(SyntaxKind::Ident), - ], - )); - - m.push(( - SyntaxKind::AlterDatabaseRefreshCollStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Database), - SyntaxToken::Required(SyntaxKind::Ident), - SyntaxToken::Required(SyntaxKind::Refresh), - SyntaxToken::Required(SyntaxKind::Collation), - SyntaxToken::Required(SyntaxKind::VersionP), - ], - )); - - m.push(( - SyntaxKind::AlterDatabaseSetStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Database), - SyntaxToken::Required(SyntaxKind::Ident), - SyntaxToken::Required(SyntaxKind::Set), - ], - )); - - m.push(( - SyntaxKind::AlterDatabaseSetStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Database), - SyntaxToken::Required(SyntaxKind::Ident), - SyntaxToken::Required(SyntaxKind::Reset), - ], - )); - - m.push(( - SyntaxKind::CreateConversionStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Optional(SyntaxKind::Default), - SyntaxToken::Required(SyntaxKind::ConversionP), - ], - )); - - m.push(( - SyntaxKind::CreateCastStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Cast), - ], - )); - - m.push(( - SyntaxKind::CreateOpClassStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Operator), - SyntaxToken::Required(SyntaxKind::Class), - ], - )); - - m.push(( - SyntaxKind::CreateOpFamilyStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Operator), - SyntaxToken::Required(SyntaxKind::Family), - ], - )); - - m.push(( - SyntaxKind::AlterOpFamilyStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Operator), - SyntaxToken::Required(SyntaxKind::Family), - ], - )); - - m.push(( - SyntaxKind::PrepareStmt, - &[SyntaxToken::Required(SyntaxKind::Prepare)], - )); - - // m.push(( - // SyntaxKind::ExecuteStmt, - // &[SyntaxToken::Required(SyntaxKind::Execute)], - // )); - - m.push(( - SyntaxKind::DeallocateStmt, - &[SyntaxToken::Required(SyntaxKind::Deallocate)], - )); - - m.push(( - SyntaxKind::CreateTableSpaceStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Tablespace), - ], - )); - - m.push(( - SyntaxKind::DropTableSpaceStmt, - &[ - SyntaxToken::Required(SyntaxKind::Drop), - SyntaxToken::Required(SyntaxKind::Tablespace), - ], - )); - - m.push(( - SyntaxKind::AlterOperatorStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Operator), - ], - )); - - m.push(( - SyntaxKind::AlterTypeStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::TypeP), - ], - )); - - m.push(( - SyntaxKind::DropOwnedStmt, - &[ - SyntaxToken::Required(SyntaxKind::Drop), - SyntaxToken::Required(SyntaxKind::Owned), - SyntaxToken::Required(SyntaxKind::By), - ], - )); - - m.push(( - SyntaxKind::ReassignOwnedStmt, - &[ - SyntaxToken::Required(SyntaxKind::Reassign), - SyntaxToken::Required(SyntaxKind::Owned), - SyntaxToken::Required(SyntaxKind::By), - ], - )); - - m.push(( - SyntaxKind::CreateFdwStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Foreign), - SyntaxToken::Required(SyntaxKind::DataP), - SyntaxToken::Required(SyntaxKind::Wrapper), - ], - )); - - m.push(( - SyntaxKind::AlterFdwStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Foreign), - SyntaxToken::Required(SyntaxKind::DataP), - SyntaxToken::Required(SyntaxKind::Wrapper), - ], - )); - - m.push(( - SyntaxKind::CreateForeignServerStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Server), - ], - )); - - m.push(( - SyntaxKind::AlterForeignServerStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Server), - ], - )); - - m.push(( - SyntaxKind::CreateUserMappingStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::User), - SyntaxToken::Required(SyntaxKind::Mapping), - ], - )); - - m.push(( - SyntaxKind::AlterUserMappingStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::User), - SyntaxToken::Required(SyntaxKind::Mapping), - SyntaxToken::Required(SyntaxKind::For), - ], - )); - - m.push(( - SyntaxKind::DropUserMappingStmt, - &[ - SyntaxToken::Required(SyntaxKind::Drop), - SyntaxToken::Required(SyntaxKind::User), - SyntaxToken::Required(SyntaxKind::Mapping), - ], - )); - - m.push(( - SyntaxKind::SecLabelStmt, - &[ - SyntaxToken::Required(SyntaxKind::Security), - SyntaxToken::Required(SyntaxKind::Label), - ], - )); - - m.push(( - SyntaxKind::CreateForeignTableStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Foreign), - SyntaxToken::Required(SyntaxKind::Table), - ], - )); - - m.push(( - SyntaxKind::ImportForeignSchemaStmt, - &[ - SyntaxToken::Required(SyntaxKind::ImportP), - SyntaxToken::Required(SyntaxKind::Foreign), - SyntaxToken::Required(SyntaxKind::Schema), - ], - )); - - m.push(( - SyntaxKind::CreateExtensionStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Extension), - ], - )); - - m.push(( - SyntaxKind::AlterExtensionStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Extension), - ], - )); - - m.push(( - SyntaxKind::CreateEventTrigStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Event), - SyntaxToken::Required(SyntaxKind::Trigger), - ], - )); - - m.push(( - SyntaxKind::AlterEventTrigStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Event), - SyntaxToken::Required(SyntaxKind::Trigger), - ], - )); - - m.push(( - SyntaxKind::RefreshMatViewStmt, - &[ - SyntaxToken::Required(SyntaxKind::Refresh), - SyntaxToken::Required(SyntaxKind::Materialized), - SyntaxToken::Required(SyntaxKind::View), - ], - )); - - m.push(( - SyntaxKind::AlterSystemStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::SystemP), - ], - )); - - m.push(( - SyntaxKind::CreatePolicyStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Policy), - ], - )); - - m.push(( - SyntaxKind::AlterPolicyStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Policy), - ], - )); - - m.push(( - SyntaxKind::CreateTransformStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Optional(SyntaxKind::Or), - SyntaxToken::Optional(SyntaxKind::Replace), - SyntaxToken::Required(SyntaxKind::Transform), - SyntaxToken::Required(SyntaxKind::For), - ], - )); - - m.push(( - SyntaxKind::CreateAmStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Access), - SyntaxToken::Required(SyntaxKind::Method), - ], - )); - - m.push(( - SyntaxKind::CreatePublicationStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Publication), - ], - )); - - m.push(( - SyntaxKind::AlterPublicationStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Publication), - ], - )); - - m.push(( - SyntaxKind::CreateSubscriptionStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Subscription), - ], - )); - - m.push(( - SyntaxKind::AlterSubscriptionStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Subscription), - ], - )); - - m.push(( - SyntaxKind::DropSubscriptionStmt, - &[ - SyntaxToken::Required(SyntaxKind::Drop), - SyntaxToken::Required(SyntaxKind::Subscription), - ], - )); - - m.push(( - SyntaxKind::CreateStatsStmt, - &[ - SyntaxToken::Required(SyntaxKind::Create), - SyntaxToken::Required(SyntaxKind::Statistics), - ], - )); - - m.push(( - SyntaxKind::AlterCollationStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Collation), - ], - )); - - m.push(( - SyntaxKind::CallStmt, - &[SyntaxToken::Required(SyntaxKind::Call)], - )); - - m.push(( - SyntaxKind::AlterStatsStmt, - &[ - SyntaxToken::Required(SyntaxKind::Alter), - SyntaxToken::Required(SyntaxKind::Statistics), - ], - )); - - let mut vec: Vec>> = Vec::new(); - - m.iter().for_each(|(statement, tokens)| { - let mut left_pull: usize = 0; - tokens.iter().enumerate().for_each(|(idx, token)| { - if vec.len() <= idx { - vec.push(HashMap::new()); - } - - let is_last = idx == tokens.len() - 1; - - match token { - SyntaxToken::Required(t) => { - for i in (idx - left_pull)..(idx + 1) { - let list_entry = vec[i].entry(t.to_owned()); - list_entry - .and_modify(|list| { - list.push(if is_last { - TokenStatement::EoS(statement.to_owned()) - } else { - TokenStatement::Any(statement.to_owned()) - }); - }) - .or_insert(vec![if is_last { - TokenStatement::EoS(statement.to_owned()) - } else { - TokenStatement::Any(statement.to_owned()) - }]); - } - } - SyntaxToken::Optional(t) => { - if is_last { - panic!("Optional token cannot be last token"); - } - for i in (idx - left_pull)..(idx + 1) { - let list_entry = vec[i].entry(t.to_owned()); - list_entry - .and_modify(|list| { - list.push(TokenStatement::Any(statement.to_owned())); - }) - .or_insert(vec![TokenStatement::Any(statement.to_owned())]); - } - left_pull += 1; - } - } - }); - }); - - vec - }); - -// TODO: complete the hashmap above with all statements: -// RETURN statement (inside SQL function body) -// ReturnStmt, -// SetOperationStmt, -// -// TODO: parsing ambiguity, check docs for solution -// GrantStmt(super::GrantStmt), -// GrantRoleStmt(super::GrantRoleStmt), -// ClosePortalStmt, -// CreatePlangStmt, -// AlterRoleSetStmt, -// DeclareCursorStmt, -// AlterObjectDependsStmt, -// AlterObjectSchemaStmt, -// AlterOwnerStmt, -// AlterEnumStmt, -// AlterTsdictionaryStmt, -// AlterTsconfigurationStmt, -// AlterTableSpaceOptionsStmt, -// AlterTableMoveAllStmt, -// AlterExtensionContentsStmt, -// ReplicaIdentityStmt, -// - -/// Returns the statement at which the parser is currently at, if any -pub fn is_at_stmt_start(parser: &mut Parser) -> Option { - let mut options = Vec::new(); - for i in 0..STATEMENT_START_TOKEN_MAPS.len() { - // important, else infinite loop: only ignore whitespaces after first token - let token = parser.nth(i, i != 0).kind; - if let Some(result) = STATEMENT_START_TOKEN_MAPS[i].get(&token) { - if i == 0 { - options = result.clone(); - } else { - options = result - .iter() - .filter(|o| options.contains(o)) - .cloned() - .collect(); - } - } else if options.len() > 1 { - // no result is found, and there is currently more than one option - // filter the options for all statements that are complete at this point - options.retain(|o| o.is_eos()); - } - - if options.len() == 0 { - break; - } else if options.len() == 1 && options.get(0).unwrap().is_eos() { - break; - } - } - if options.len() == 0 { - None - } else if options.len() == 1 && options.get(0).unwrap().is_eos() { - Some(options.get(0).unwrap().kind()) - } else { - panic!("Ambiguous statement"); - } -} diff --git a/crates/pg_statement_splitter/src/lib.rs b/crates/pg_statement_splitter/src/lib.rs index adaea4751..08d903055 100644 --- a/crates/pg_statement_splitter/src/lib.rs +++ b/crates/pg_statement_splitter/src/lib.rs @@ -1,118 +1,21 @@ ///! Postgres Statement Splitter ///! ///! This crate provides a function to split a SQL source string into individual statements. -///! -///! TODO: -///! Instead of relying on statement start tokens, we need to include as many tokens as -///! possible. For example, a `CREATE TRIGGER` statement includes an `EXECUTE [ PROCEDURE | -///! FUNCTION ]` clause, but `EXECUTE` is also a statement start token for an `EXECUTE` statement. -/// We should expand the definition map to include an `Any*`, which must be followed by at least -/// one required token and allows the parser to search for the end tokens of the statement. This -/// will hopefully be enough to reduce collisions to zero. -mod is_at_stmt_start; + +mod data; +mod split; mod parser; mod syntax_error; -use is_at_stmt_start::{is_at_stmt_start, TokenStatement, STATEMENT_START_TOKEN_MAPS}; - use parser::{Parse, Parser}; -use pg_lexer::{lex, SyntaxKind}; +use pg_lexer::{lex}; +use split::parse_source; pub fn split(sql: &str) -> Parse { let mut parser = Parser::new(lex(sql)); - while !parser.eof() { - match is_at_stmt_start(&mut parser) { - Some(stmt) => { - parser.start_stmt(); - - // advance over all start tokens of the statement - for i in 0..STATEMENT_START_TOKEN_MAPS.len() { - parser.eat_whitespace(); - let token = parser.nth(0, false); - if let Some(result) = STATEMENT_START_TOKEN_MAPS[i].get(&token.kind) { - let is_in_results = result - .iter() - .find(|x| match x { - TokenStatement::EoS(y) | TokenStatement::Any(y) => y == &stmt, - }) - .is_some(); - if i == 0 && !is_in_results { - panic!("Expected statement start"); - } else if is_in_results { - parser.expect(token.kind); - } else { - break; - } - } - } - - // move until the end of the statement, or until the next statement start - let mut is_sub_stmt = 0; - let mut is_sub_trx = 0; - let mut ignore_next_non_whitespace = false; - while !parser.at(SyntaxKind::Ascii59) && !parser.eof() { - match parser.nth(0, false).kind { - SyntaxKind::All => { - // ALL is never a statement start, but needs to be skipped when combining queries - // (e.g. UNION ALL) - parser.advance(); - } - SyntaxKind::BeginP => { - // BEGIN, consume until END - is_sub_trx += 1; - parser.advance(); - } - SyntaxKind::EndP => { - is_sub_trx -= 1; - parser.advance(); - } - // opening brackets "(", consume until closing bracket ")" - SyntaxKind::Ascii40 => { - is_sub_stmt += 1; - parser.advance(); - } - SyntaxKind::Ascii41 => { - is_sub_stmt -= 1; - parser.advance(); - } - SyntaxKind::As - | SyntaxKind::Union - | SyntaxKind::Intersect - | SyntaxKind::Except => { - // ignore the next non-whitespace token - ignore_next_non_whitespace = true; - parser.advance(); - } - _ => { - // if another stmt FIRST is encountered, break - // ignore if parsing sub stmt - if ignore_next_non_whitespace == false - && is_sub_stmt == 0 - && is_sub_trx == 0 - && is_at_stmt_start(&mut parser).is_some() - { - break; - } else { - if ignore_next_non_whitespace == true && !parser.at_whitespace() { - ignore_next_non_whitespace = false; - } - parser.advance(); - } - } - } - } - - parser.expect(SyntaxKind::Ascii59); - - parser.close_stmt(); - } - None => { - parser.advance(); - } - } - } + parse_source(&mut parser); parser.finish() } @@ -123,15 +26,32 @@ mod tests { #[test] fn test_splitter() { - let input = "select 1 from contact;\nselect 1;\nalter table test drop column id;"; + let input = "select 1 from contact;\nselect 1;"; let res = split(input); - assert_eq!(res.ranges.len(), 3); + assert_eq!(res.ranges.len(), 2); assert_eq!("select 1 from contact;", input[res.ranges[0]].to_string()); assert_eq!("select 1;", input[res.ranges[1]].to_string()); - assert_eq!( - "alter table test drop column id;", - input[res.ranges[2]].to_string() - ); + } + + #[test] + fn test_splitter_no_semicolons() { + let input = "select 1 from contact\nselect 1"; + + let res = split(input); + assert_eq!(res.ranges.len(), 2); + assert_eq!("select 1 from contact", input[res.ranges[0]].to_string()); + assert_eq!("select 1", input[res.ranges[1]].to_string()); + } + + #[test] + fn test_splitter_double_newlines() { + let input = "select 1 from contact\nselect 1\n\nalter table t add column c int"; + + let res = split(input); + assert_eq!(res.ranges.len(), 3); + assert_eq!("select 1 from contact", input[res.ranges[0]].to_string()); + assert_eq!("select 1", input[res.ranges[1]].to_string()); + assert_eq!("alter table t add column c int", input[res.ranges[2]].to_string()); } } diff --git a/crates/pg_statement_splitter/src/parser.rs b/crates/pg_statement_splitter/src/parser.rs index 1b3d0f8bc..c7468e873 100644 --- a/crates/pg_statement_splitter/src/parser.rs +++ b/crates/pg_statement_splitter/src/parser.rs @@ -57,43 +57,41 @@ impl Parser { let text_start = from.unwrap().span.start(); let text_end = to.unwrap().span.end(); - TextRange::new( - TextSize::try_from(text_start).unwrap(), - TextSize::try_from(text_end).unwrap(), - ) + TextRange::new(text_start, text_end) }) .collect(), errors: self.errors, } } + /// Start statement at last non-whitespace token pub fn start_stmt(&mut self) { assert!(self.current_stmt_start.is_none()); - self.current_stmt_start = Some(self.pos); + + if let Some(whitespace_token_buffer) = self.whitespace_token_buffer { + self.current_stmt_start = Some(whitespace_token_buffer); + } else { + self.current_stmt_start = Some(self.pos); + } } + /// Close statement at last non-whitespace token pub fn close_stmt(&mut self) { assert!(self.current_stmt_start.is_some()); - self.ranges - .push((self.current_stmt_start.take().unwrap(), self.pos)); - } - /// collects an SyntaxError with an `error` message at `pos` - pub fn error_at_pos(&mut self, error: String, pos: usize) { - self.errors.push(SyntaxError::new_at_offset( - error, - self.tokens - .get(min(self.tokens.len() - 1, pos)) - .unwrap() - .span - .start(), + self.ranges.push(( + self.current_stmt_start.unwrap(), + self.whitespace_token_buffer.unwrap_or(self.pos), )); + + self.current_stmt_start = None; } /// applies token and advances pub fn advance(&mut self) { assert!(!self.eof()); - if self.nth(0, false).kind == SyntaxKind::Whitespace { + + if self.nth(0).kind == SyntaxKind::Whitespace { if self.whitespace_token_buffer.is_none() { self.whitespace_token_buffer = Some(self.pos); } @@ -103,19 +101,10 @@ impl Parser { self.pos += 1; } - /// flush token buffer and applies all tokens - pub fn flush_token_buffer(&mut self) { - if self.whitespace_token_buffer.is_none() { - return; - } - while self.whitespace_token_buffer.unwrap() < self.pos { - self.whitespace_token_buffer = Some(self.whitespace_token_buffer.unwrap() + 1); - } - self.whitespace_token_buffer = None; - } - + /// checks if the current token is of `kind` and advances if true + /// returns true if the current token is of `kind` pub fn eat(&mut self, kind: SyntaxKind) -> bool { - if self.at(kind) { + if self.nth(0).kind == kind { self.advance(); true } else { @@ -124,73 +113,69 @@ impl Parser { } pub fn at_whitespace(&self) -> bool { - self.nth(0, false).kind == SyntaxKind::Whitespace + self.nth(0).kind == SyntaxKind::Whitespace } - pub fn eat_whitespace(&mut self) { - while self.nth(0, false).token_type == TokenType::Whitespace { - self.advance(); + pub fn peek(&self) -> &Token { + self.nth(1) + } + + pub fn expect(&mut self, kind: SyntaxKind) { + if self.nth(0).kind == kind { + return; } + + self.error_at(format!("Expected {:#?}", kind)); } pub fn eof(&self) -> bool { self.pos == self.tokens.len() } - /// lookahead method. - /// - /// if `ignore_whitespace` is true, it will skip all whitespace tokens - pub fn nth(&self, lookahead: usize, ignore_whitespace: bool) -> &Token { - if ignore_whitespace { - let mut idx = 0; - let mut non_whitespace_token_ctr = 0; - loop { - match self.tokens.get(self.pos + idx) { - Some(token) => { - if !WHITESPACE_TOKENS.contains(&token.kind) { - if non_whitespace_token_ctr == lookahead { - return token; - } - non_whitespace_token_ctr += 1; - } - idx += 1; - } - None => { - return &self.eof_token; - } - } - } - } else { - match self.tokens.get(self.pos + lookahead) { - Some(token) => token, - None => &self.eof_token, + /// flush token buffer and applies all tokens + fn flush_token_buffer(&mut self) { + if self.whitespace_token_buffer.is_none() { + return; + } + while self.whitespace_token_buffer.unwrap() < self.pos { + self.whitespace_token_buffer = Some(self.whitespace_token_buffer.unwrap() + 1); + } + self.whitespace_token_buffer = None; + } + + pub fn next(&mut self) -> &Token { + loop { + if self.at_whitespace() { + self.advance(); + continue; } + break; } + + self.nth(0) } - /// checks if the current token is of `kind` - pub fn at(&self, kind: SyntaxKind) -> bool { - self.nth(0, false).kind == kind + /// collects an SyntaxError with an `error` message at the current position + fn error_at(&mut self, error: String) { + self.errors.push(SyntaxError::new_at_offset( + error, + self.tokens + .get(min( + self.tokens.len() - 1, + self.whitespace_token_buffer.unwrap_or(self.pos), + )) + .unwrap() + .span + .start(), + )); } - pub fn expect(&mut self, kind: SyntaxKind) { - if self.eat(kind) { - return; - } - if self.whitespace_token_buffer.is_some() { - self.error_at_pos( - format!( - "Expected {:#?}, found {:#?}", - kind, - self.tokens[self.whitespace_token_buffer.unwrap()].kind - ), - self.whitespace_token_buffer.unwrap(), - ); - } else { - self.error_at_pos( - format!("Expected {:#?}, found {:#?}", kind, self.nth(0, false)), - self.pos + 1, - ); + /// lookahead method. + fn nth(&self, lookahead: usize) -> &Token { + match self.tokens.get(self.pos + lookahead) { + Some(token) => token, + None => &self.eof_token, } } } + diff --git a/crates/pg_statement_splitter/src/split.rs b/crates/pg_statement_splitter/src/split.rs new file mode 100644 index 000000000..36f5e062b --- /dev/null +++ b/crates/pg_statement_splitter/src/split.rs @@ -0,0 +1,148 @@ +use pg_lexer::{SyntaxKind, Token, TokenType}; + +use crate::{data::at_statement_start, parser::Parser}; + +pub(crate) fn parse_source(p: &mut Parser) { + loop { + match p.peek() { + Token { + kind: SyntaxKind::Eof, + .. + } => { + break; + } + Token { + token_type: TokenType::Whitespace, + .. + } => { + p.advance(); + } + _ => { + parse_statement(p); + } + } + } +} + +fn parse_statement(p: &mut Parser) { + p.start_stmt(); + // todo move the below into parse_dml so that we dont have conflicts with parse stmt + match p.peek().kind { + SyntaxKind::With => { + parse_cte(p); + } + SyntaxKind::Select => { + parse_select(p); + } + SyntaxKind::Insert => { + parse_insert(p); + } + SyntaxKind::Update => { + parse_update(p); + } + SyntaxKind::DeleteP => { + parse_delete(p); + } + _ => { + parse_unknown(p); + } + } + p.close_stmt(); +} + +fn parse_cte(p: &mut Parser) { + println!("Parsing cte statement"); + p.start_stmt(); + + // todo make adance and all methods that call advance ignore whitespace + p.expect(SyntaxKind::With); + + loop { + p.expect(SyntaxKind::Ident); + p.expect(SyntaxKind::As); + parse_parenthesis(p); + + // todo handle comma + if !p.eat(SyntaxKind::Ascii00) { + break; + } + } + + parse_statement(p); +} + +// todo add common checker for within statements that checks for parenthesis, semicolons, statement +// starts etc and then we can add custom ones eg union for select +fn parse_select(p: &mut Parser) { + println!("Parsing select statement"); + p.start_stmt(); + + p.expect(SyntaxKind::Select); + + loop { + println!("parse select at {:?}", p.current().kind); + if p.eat(SyntaxKind::Ascii59) { + break; + } + + if p.at_double_newline() { + break; + } + + if p.at(SyntaxKind::Eof) { + break; + } + + if p.at(SyntaxKind::Ascii40) { + parse_parenthesis(p); + } + + if [ + SyntaxKind::Insert, + SyntaxKind::Update, + SyntaxKind::DeleteP, + SyntaxKind::Select, + ] + .contains(&p.peek().kind) + { + break; + } + + p.advance(); + } + + p.close_stmt(); +} + +fn parse_parenthesis(p: &mut Parser) { + p.expect(SyntaxKind::Ascii40); + + loop { + if p.eof() { + p.expect(SyntaxKind::Ascii41); + break; + } + if p.at(SyntaxKind::Ascii41) { + break; + } + } +} + +fn parse_insert(p: &mut Parser) { + p.expect(SyntaxKind::Insert); + p.expect(SyntaxKind::Into); +} + +fn parse_update(p: &mut Parser) { + p.expect(SyntaxKind::Update); +} + +fn parse_delete(p: &mut Parser) { + p.expect(SyntaxKind::DeleteP); + p.expect(SyntaxKind::From); + + p.eat_whitespace(); +} + + + From 9ae67e03cef9a5972e2c95ca0571ada2f9150660 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sat, 5 Oct 2024 18:17:15 +0200 Subject: [PATCH 02/13] fix: save --- crates/pg_statement_splitter/src/lib.rs | 23 ++- crates/pg_statement_splitter/src/parser.rs | 130 +++++++++------ .../src/parser/common.rs | 109 +++++++++++++ .../src/{ => parser}/data.rs | 0 .../pg_statement_splitter/src/parser/dml.rs | 28 ++++ crates/pg_statement_splitter/src/split.rs | 148 ------------------ 6 files changed, 230 insertions(+), 208 deletions(-) create mode 100644 crates/pg_statement_splitter/src/parser/common.rs rename crates/pg_statement_splitter/src/{ => parser}/data.rs (100%) create mode 100644 crates/pg_statement_splitter/src/parser/dml.rs delete mode 100644 crates/pg_statement_splitter/src/split.rs diff --git a/crates/pg_statement_splitter/src/lib.rs b/crates/pg_statement_splitter/src/lib.rs index 08d903055..6a719aa5f 100644 --- a/crates/pg_statement_splitter/src/lib.rs +++ b/crates/pg_statement_splitter/src/lib.rs @@ -1,21 +1,17 @@ ///! Postgres Statement Splitter ///! ///! This crate provides a function to split a SQL source string into individual statements. - -mod data; -mod split; mod parser; mod syntax_error; -use parser::{Parse, Parser}; +use parser::{source, Parse, Parser}; -use pg_lexer::{lex}; -use split::parse_source; +use pg_lexer::lex; pub fn split(sql: &str) -> Parse { let mut parser = Parser::new(lex(sql)); - parse_source(&mut parser); + source(&mut parser); parser.finish() } @@ -25,8 +21,8 @@ mod tests { use super::*; #[test] - fn test_splitter() { - let input = "select 1 from contact;\nselect 1;"; + fn basic() { + let input = "select 1 from contact; select 1;"; let res = split(input); assert_eq!(res.ranges.len(), 2); @@ -35,7 +31,7 @@ mod tests { } #[test] - fn test_splitter_no_semicolons() { + fn no_semicolons() { let input = "select 1 from contact\nselect 1"; let res = split(input); @@ -45,13 +41,16 @@ mod tests { } #[test] - fn test_splitter_double_newlines() { + fn double_newlines() { let input = "select 1 from contact\nselect 1\n\nalter table t add column c int"; let res = split(input); assert_eq!(res.ranges.len(), 3); assert_eq!("select 1 from contact", input[res.ranges[0]].to_string()); assert_eq!("select 1", input[res.ranges[1]].to_string()); - assert_eq!("alter table t add column c int", input[res.ranges[2]].to_string()); + assert_eq!( + "alter table t add column c int", + input[res.ranges[2]].to_string() + ); } } diff --git a/crates/pg_statement_splitter/src/parser.rs b/crates/pg_statement_splitter/src/parser.rs index c7468e873..8bb7ecbec 100644 --- a/crates/pg_statement_splitter/src/parser.rs +++ b/crates/pg_statement_splitter/src/parser.rs @@ -1,3 +1,9 @@ +mod common; +mod data; +mod dml; + +pub use common::source; + use std::cmp::min; use pg_lexer::{SyntaxKind, Token, TokenType, WHITESPACE_TOKENS}; @@ -52,7 +58,7 @@ impl Parser { .iter() .map(|(start, end)| { let from = self.tokens.get(*start); - let to = self.tokens.get(end - 1); + let to = self.tokens.get(*end); // get text range from token range let text_start = from.unwrap().span.start(); let text_end = to.unwrap().span.end(); @@ -64,13 +70,17 @@ impl Parser { } } - /// Start statement at last non-whitespace token + /// Start statement at next non-whitespace token pub fn start_stmt(&mut self) { assert!(self.current_stmt_start.is_none()); if let Some(whitespace_token_buffer) = self.whitespace_token_buffer { self.current_stmt_start = Some(whitespace_token_buffer); } else { + while self.nth(0, false).token_type == TokenType::Whitespace { + self.advance(false); + } + self.current_stmt_start = Some(self.pos); } } @@ -79,6 +89,13 @@ impl Parser { pub fn close_stmt(&mut self) { assert!(self.current_stmt_start.is_some()); + println!( + "Closing statement {:?} / {:?}: {:?}", + self.whitespace_token_buffer, + self.pos, + self.tokens.get(self.pos) + ); + self.ranges.push(( self.current_stmt_start.unwrap(), self.whitespace_token_buffer.unwrap_or(self.pos), @@ -88,24 +105,41 @@ impl Parser { } /// applies token and advances - pub fn advance(&mut self) { - assert!(!self.eof()); + /// + /// if `ignore_whitespace` is true, it will advance the next non-whitespace token + pub fn advance(&mut self, ignore_whitespace: bool) { + assert!(!self.eof(ignore_whitespace)); + + loop { + let whitespace = match self.nth(0, false).kind { + SyntaxKind::Whitespace => { + if self.whitespace_token_buffer.is_none() { + self.whitespace_token_buffer = Some(self.pos); + } - if self.nth(0).kind == SyntaxKind::Whitespace { - if self.whitespace_token_buffer.is_none() { - self.whitespace_token_buffer = Some(self.pos); + true + } + _ => { + self.whitespace_token_buffer = None; + + false + } + }; + + self.pos += 1; + + if !whitespace || !ignore_whitespace { + break; } - } else { - self.flush_token_buffer(); } - self.pos += 1; } /// checks if the current token is of `kind` and advances if true /// returns true if the current token is of `kind` - pub fn eat(&mut self, kind: SyntaxKind) -> bool { - if self.nth(0).kind == kind { - self.advance(); + pub fn eat(&mut self, kind: SyntaxKind, ignore_whitespace: bool) -> bool { + if self.nth(1, ignore_whitespace).kind == kind { + println!("Eating {:?}", kind); + self.advance(ignore_whitespace); true } else { false @@ -113,46 +147,24 @@ impl Parser { } pub fn at_whitespace(&self) -> bool { - self.nth(0).kind == SyntaxKind::Whitespace + self.nth(0, false).kind == SyntaxKind::Whitespace } - pub fn peek(&self) -> &Token { - self.nth(1) + pub fn peek(&self, ignore_whitespace: bool) -> &Token { + self.nth(1, ignore_whitespace) } - pub fn expect(&mut self, kind: SyntaxKind) { - if self.nth(0).kind == kind { + pub fn expect(&mut self, kind: SyntaxKind, ignore_whitespace: bool) { + println!("Expecting {:?}", kind); + if self.eat(kind, ignore_whitespace) { return; } self.error_at(format!("Expected {:#?}", kind)); } - pub fn eof(&self) -> bool { - self.pos == self.tokens.len() - } - - /// flush token buffer and applies all tokens - fn flush_token_buffer(&mut self) { - if self.whitespace_token_buffer.is_none() { - return; - } - while self.whitespace_token_buffer.unwrap() < self.pos { - self.whitespace_token_buffer = Some(self.whitespace_token_buffer.unwrap() + 1); - } - self.whitespace_token_buffer = None; - } - - pub fn next(&mut self) -> &Token { - loop { - if self.at_whitespace() { - self.advance(); - continue; - } - break; - } - - self.nth(0) + pub fn eof(&self, ignore_whitespace: bool) -> bool { + self.peek(ignore_whitespace).kind == SyntaxKind::Eof } /// collects an SyntaxError with an `error` message at the current position @@ -171,11 +183,33 @@ impl Parser { } /// lookahead method. - fn nth(&self, lookahead: usize) -> &Token { - match self.tokens.get(self.pos + lookahead) { - Some(token) => token, - None => &self.eof_token, + /// + /// if `ignore_whitespace` is true, it will skip all whitespace tokens + pub fn nth(&self, lookahead: usize, ignore_whitespace: bool) -> &Token { + if ignore_whitespace { + let mut idx = 0; + let mut non_whitespace_token_ctr = 0; + loop { + match self.tokens.get(self.pos + idx) { + Some(token) => { + if !WHITESPACE_TOKENS.contains(&token.kind) { + if non_whitespace_token_ctr == lookahead { + return token; + } + non_whitespace_token_ctr += 1; + } + idx += 1; + } + None => { + return &self.eof_token; + } + } + } + } else { + match self.tokens.get(self.pos + lookahead) { + Some(token) => token, + None => &self.eof_token, + } } } } - diff --git a/crates/pg_statement_splitter/src/parser/common.rs b/crates/pg_statement_splitter/src/parser/common.rs new file mode 100644 index 000000000..8c2557c04 --- /dev/null +++ b/crates/pg_statement_splitter/src/parser/common.rs @@ -0,0 +1,109 @@ +use pg_lexer::{SyntaxKind, Token}; + +use super::{ + dml::{cte, select}, + Parser, +}; + +pub fn source(p: &mut Parser) { + loop { + // todo find a better way to handle stmt start + // same problem as below... for the first token we need to use nth(0), + // but for the rest we need to use peek + p.start_stmt(); + statement(p); + p.close_stmt(); + + if p.eof(true) { + break; + } + } +} + +pub(crate) fn statement(p: &mut Parser) { + // todo find a better way to handle first token + let token = if p.pos == 0 { + p.nth(0, true) + } else { + p.peek(true) + }; + + match token.kind { + SyntaxKind::With => { + cte(p); + } + SyntaxKind::Select => { + select(p); + } + SyntaxKind::Insert => { + todo!(); + // insert(p); + } + SyntaxKind::Update => { + todo!(); + // update(p); + } + SyntaxKind::DeleteP => { + todo!(); + // delete(p); + } + t => { + panic!("stmt: Unknown token {:?}", t); + // unknown(p); + } + } +} + +pub(crate) fn parenthesis(p: &mut Parser) { + p.expect(SyntaxKind::Ascii40, true); + + loop { + if p.eof(true) { + p.expect(SyntaxKind::Ascii41, true); + break; + } + if p.nth(0, true).kind == SyntaxKind::Ascii41 { + break; + } + } +} + +pub(crate) fn unknown(p: &mut Parser) { + loop { + match p.peek(false) { + t @ Token { + kind: SyntaxKind::Newline, + .. + } => { + if t.text.chars().count() > 1 { + p.advance(false); + break; + } + } + Token { + // ";" + kind: SyntaxKind::Ascii59, + .. + } => { + p.advance(false); + break; + } + Token { + kind: SyntaxKind::Eof, + .. + } => { + break; + } + Token { + kind: SyntaxKind::Ascii40, + .. + } => { + parenthesis(p); + } + t => { + println!("Unknown token {:?}", t); + p.advance(false); + } + } + } +} diff --git a/crates/pg_statement_splitter/src/data.rs b/crates/pg_statement_splitter/src/parser/data.rs similarity index 100% rename from crates/pg_statement_splitter/src/data.rs rename to crates/pg_statement_splitter/src/parser/data.rs diff --git a/crates/pg_statement_splitter/src/parser/dml.rs b/crates/pg_statement_splitter/src/parser/dml.rs new file mode 100644 index 000000000..8d1c5e46e --- /dev/null +++ b/crates/pg_statement_splitter/src/parser/dml.rs @@ -0,0 +1,28 @@ +use pg_lexer::SyntaxKind; + +use super::{ + common::{parenthesis, statement, unknown}, + Parser, +}; + +pub(crate) fn cte(p: &mut Parser) { + p.expect(SyntaxKind::With, true); + + loop { + p.expect(SyntaxKind::Ident, true); + p.expect(SyntaxKind::As, true); + parenthesis(p); + + if !p.eat(SyntaxKind::Ascii44, true) { + break; + } + } + + statement(p); +} + +pub(crate) fn select(p: &mut Parser) { + p.expect(SyntaxKind::Select, true); + + unknown(p); +} diff --git a/crates/pg_statement_splitter/src/split.rs b/crates/pg_statement_splitter/src/split.rs deleted file mode 100644 index 36f5e062b..000000000 --- a/crates/pg_statement_splitter/src/split.rs +++ /dev/null @@ -1,148 +0,0 @@ -use pg_lexer::{SyntaxKind, Token, TokenType}; - -use crate::{data::at_statement_start, parser::Parser}; - -pub(crate) fn parse_source(p: &mut Parser) { - loop { - match p.peek() { - Token { - kind: SyntaxKind::Eof, - .. - } => { - break; - } - Token { - token_type: TokenType::Whitespace, - .. - } => { - p.advance(); - } - _ => { - parse_statement(p); - } - } - } -} - -fn parse_statement(p: &mut Parser) { - p.start_stmt(); - // todo move the below into parse_dml so that we dont have conflicts with parse stmt - match p.peek().kind { - SyntaxKind::With => { - parse_cte(p); - } - SyntaxKind::Select => { - parse_select(p); - } - SyntaxKind::Insert => { - parse_insert(p); - } - SyntaxKind::Update => { - parse_update(p); - } - SyntaxKind::DeleteP => { - parse_delete(p); - } - _ => { - parse_unknown(p); - } - } - p.close_stmt(); -} - -fn parse_cte(p: &mut Parser) { - println!("Parsing cte statement"); - p.start_stmt(); - - // todo make adance and all methods that call advance ignore whitespace - p.expect(SyntaxKind::With); - - loop { - p.expect(SyntaxKind::Ident); - p.expect(SyntaxKind::As); - parse_parenthesis(p); - - // todo handle comma - if !p.eat(SyntaxKind::Ascii00) { - break; - } - } - - parse_statement(p); -} - -// todo add common checker for within statements that checks for parenthesis, semicolons, statement -// starts etc and then we can add custom ones eg union for select -fn parse_select(p: &mut Parser) { - println!("Parsing select statement"); - p.start_stmt(); - - p.expect(SyntaxKind::Select); - - loop { - println!("parse select at {:?}", p.current().kind); - if p.eat(SyntaxKind::Ascii59) { - break; - } - - if p.at_double_newline() { - break; - } - - if p.at(SyntaxKind::Eof) { - break; - } - - if p.at(SyntaxKind::Ascii40) { - parse_parenthesis(p); - } - - if [ - SyntaxKind::Insert, - SyntaxKind::Update, - SyntaxKind::DeleteP, - SyntaxKind::Select, - ] - .contains(&p.peek().kind) - { - break; - } - - p.advance(); - } - - p.close_stmt(); -} - -fn parse_parenthesis(p: &mut Parser) { - p.expect(SyntaxKind::Ascii40); - - loop { - if p.eof() { - p.expect(SyntaxKind::Ascii41); - break; - } - if p.at(SyntaxKind::Ascii41) { - break; - } - } -} - -fn parse_insert(p: &mut Parser) { - p.expect(SyntaxKind::Insert); - p.expect(SyntaxKind::Into); -} - -fn parse_update(p: &mut Parser) { - p.expect(SyntaxKind::Update); -} - -fn parse_delete(p: &mut Parser) { - p.expect(SyntaxKind::DeleteP); - p.expect(SyntaxKind::From); - - p.eat_whitespace(); -} - - - From 0d757f65668104b405913e47537be4619448a5f0 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sun, 13 Oct 2024 17:25:47 +0200 Subject: [PATCH 03/13] fix: ci --- Cargo.lock | 85 +++++++- crates/pg_statement_splitter/Cargo.toml | 1 + crates/pg_statement_splitter/src/lib.rs | 14 +- crates/pg_statement_splitter/src/parser.rs | 187 +++++------------- .../src/parser/common.rs | 83 ++++---- .../pg_statement_splitter/src/parser/dml.rs | 10 +- 6 files changed, 180 insertions(+), 200 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0044279ed..025c82314 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -850,6 +850,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashbrown" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" + [[package]] name = "hashlink" version = "0.8.4" @@ -929,12 +935,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.2" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "824b2ae422412366ba479e8111fd301f7b5faece8149317bb81925979a53f520" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown 0.14.3", + "hashbrown 0.15.0", ] [[package]] @@ -1187,6 +1193,39 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "ntest" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb183f0a1da7a937f672e5ee7b7edb727bf52b8a52d531374ba8ebb9345c0330" +dependencies = [ + "ntest_test_cases", + "ntest_timeout", +] + +[[package]] +name = "ntest_test_cases" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16d0d3f2a488592e5368ebbe996e7f1d44aa13156efad201f5b4d84e150eaa93" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "ntest_timeout" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc7c92f190c97f79b4a332f5e81dcf68c8420af2045c936c9be0bc9de6f63b5" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "num-bigint-dig" version = "0.8.4" @@ -1319,7 +1358,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" dependencies = [ "fixedbitset", - "indexmap 2.2.2", + "indexmap 2.6.0", ] [[package]] @@ -1507,6 +1546,7 @@ dependencies = [ name = "pg_statement_splitter" version = "0.0.0" dependencies = [ + "ntest", "pg_lexer", "pg_query", "text-size", @@ -1668,6 +1708,15 @@ dependencies = [ "syn 2.0.71", ] +[[package]] +name = "proc-macro-crate" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b" +dependencies = [ + "toml_edit", +] + [[package]] name = "proc-macro2" version = "1.0.86" @@ -2163,7 +2212,7 @@ dependencies = [ "futures-util", "hashlink", "hex", - "indexmap 2.2.2", + "indexmap 2.6.0", "log", "memchr", "once_cell", @@ -2452,6 +2501,23 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "toml_datetime" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" + +[[package]] +name = "toml_edit" +version = "0.22.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +dependencies = [ + "indexmap 2.6.0", + "toml_datetime", + "winnow", +] + [[package]] name = "tracing" version = "0.1.40" @@ -2858,6 +2924,15 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +[[package]] +name = "winnow" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" +dependencies = [ + "memchr", +] + [[package]] name = "write-json" version = "0.1.4" diff --git a/crates/pg_statement_splitter/Cargo.toml b/crates/pg_statement_splitter/Cargo.toml index 15a306807..18664bbef 100644 --- a/crates/pg_statement_splitter/Cargo.toml +++ b/crates/pg_statement_splitter/Cargo.toml @@ -6,6 +6,7 @@ edition = "2021" [dependencies] pg_lexer.workspace = true text-size = "1.1.1" +ntest = "0.9.3" [dev-dependencies] pg_query = "0.8" diff --git a/crates/pg_statement_splitter/src/lib.rs b/crates/pg_statement_splitter/src/lib.rs index 6a719aa5f..95ce680bd 100644 --- a/crates/pg_statement_splitter/src/lib.rs +++ b/crates/pg_statement_splitter/src/lib.rs @@ -6,10 +6,8 @@ mod syntax_error; use parser::{source, Parse, Parser}; -use pg_lexer::lex; - pub fn split(sql: &str) -> Parse { - let mut parser = Parser::new(lex(sql)); + let mut parser = Parser::new(sql); source(&mut parser); @@ -18,9 +16,12 @@ pub fn split(sql: &str) -> Parse { #[cfg(test)] mod tests { + use ntest::timeout; + use super::*; #[test] + #[timeout(1000)] fn basic() { let input = "select 1 from contact; select 1;"; @@ -42,15 +43,12 @@ mod tests { #[test] fn double_newlines() { - let input = "select 1 from contact\nselect 1\n\nalter table t add column c int"; + let input = "select 1 from contact\n\nselect 1\n\nselect 3"; let res = split(input); assert_eq!(res.ranges.len(), 3); assert_eq!("select 1 from contact", input[res.ranges[0]].to_string()); assert_eq!("select 1", input[res.ranges[1]].to_string()); - assert_eq!( - "alter table t add column c int", - input[res.ranges[2]].to_string() - ); + assert_eq!("select 3", input[res.ranges[2]].to_string()); } } diff --git a/crates/pg_statement_splitter/src/parser.rs b/crates/pg_statement_splitter/src/parser.rs index 8bb7ecbec..64e393c12 100644 --- a/crates/pg_statement_splitter/src/parser.rs +++ b/crates/pg_statement_splitter/src/parser.rs @@ -6,7 +6,7 @@ pub use common::source; use std::cmp::min; -use pg_lexer::{SyntaxKind, Token, TokenType, WHITESPACE_TOKENS}; +use pg_lexer::{lex, SyntaxKind, Token, WHITESPACE_TOKENS}; use text_size::{TextRange, TextSize}; use crate::syntax_error::SyntaxError; @@ -14,19 +14,17 @@ use crate::syntax_error::SyntaxError; /// Main parser that exposes the `cstree` api, and collects errors and statements pub struct Parser { /// The ranges of the statements - ranges: Vec<(usize, usize)>, + ranges: Vec, /// The syntax errors accumulated during parsing errors: Vec, /// The start of the current statement, if any - current_stmt_start: Option, + current_stmt_start: Option, /// The tokens to parse pub tokens: Vec, - /// The current position in the token stream - pub pos: usize, - /// index from which whitespace tokens are buffered - pub whitespace_token_buffer: Option, eof_token: Token, + + last_token_end: Option, } /// Result of Building @@ -39,177 +37,94 @@ pub struct Parse { } impl Parser { - pub fn new(tokens: Vec) -> Self { + pub fn new(sql: &str) -> Self { + // we dont care about whitespace tokens, except for double newlines + // to make everything simpler, we just filter them out + // the token holds the text range, so we dont need to worry about that + let tokens = lex(sql) + .iter() + .filter(|t| { + return !WHITESPACE_TOKENS.contains(&t.kind) + || (t.kind == SyntaxKind::Newline && t.text.chars().count() > 1); + }) + .rev() + .cloned() + .collect::>(); + Self { - eof_token: Token::eof(usize::from(tokens.last().unwrap().span.end())), ranges: Vec::new(), + eof_token: Token::eof(usize::from(tokens.first().unwrap().span.end())), errors: Vec::new(), current_stmt_start: None, tokens, - pos: 0, - whitespace_token_buffer: None, + last_token_end: None, } } pub fn finish(self) -> Parse { Parse { - ranges: self - .ranges - .iter() - .map(|(start, end)| { - let from = self.tokens.get(*start); - let to = self.tokens.get(*end); - // get text range from token range - let text_start = from.unwrap().span.start(); - let text_end = to.unwrap().span.end(); - - TextRange::new(text_start, text_end) - }) - .collect(), + ranges: self.ranges, errors: self.errors, } } - /// Start statement at next non-whitespace token - pub fn start_stmt(&mut self) { + /// Start statement + pub fn start_stmt(&mut self) -> Token { assert!(self.current_stmt_start.is_none()); - if let Some(whitespace_token_buffer) = self.whitespace_token_buffer { - self.current_stmt_start = Some(whitespace_token_buffer); - } else { - while self.nth(0, false).token_type == TokenType::Whitespace { - self.advance(false); - } + let token = self.peek(); - self.current_stmt_start = Some(self.pos); - } + self.current_stmt_start = Some(token.span.start()); + + token } - /// Close statement at last non-whitespace token + /// Close statement pub fn close_stmt(&mut self) { - assert!(self.current_stmt_start.is_some()); - - println!( - "Closing statement {:?} / {:?}: {:?}", - self.whitespace_token_buffer, - self.pos, - self.tokens.get(self.pos) - ); - - self.ranges.push(( - self.current_stmt_start.unwrap(), - self.whitespace_token_buffer.unwrap_or(self.pos), + self.ranges.push(TextRange::new( + self.current_stmt_start.expect("Expected active statement"), + self.last_token_end.expect("Expected last token end"), )); self.current_stmt_start = None; } - /// applies token and advances - /// - /// if `ignore_whitespace` is true, it will advance the next non-whitespace token - pub fn advance(&mut self, ignore_whitespace: bool) { - assert!(!self.eof(ignore_whitespace)); - - loop { - let whitespace = match self.nth(0, false).kind { - SyntaxKind::Whitespace => { - if self.whitespace_token_buffer.is_none() { - self.whitespace_token_buffer = Some(self.pos); - } - - true - } - _ => { - self.whitespace_token_buffer = None; - - false - } - }; - - self.pos += 1; - - if !whitespace || !ignore_whitespace { - break; - } - } + fn advance(&mut self) -> Token { + let token = self.tokens.pop().unwrap_or(self.eof_token.clone()); + + self.last_token_end = Some(token.span.end()); + + token + } + + fn peek(&mut self) -> Token { + self.tokens + .last() + .cloned() + .unwrap_or(self.eof_token.clone()) } /// checks if the current token is of `kind` and advances if true /// returns true if the current token is of `kind` - pub fn eat(&mut self, kind: SyntaxKind, ignore_whitespace: bool) -> bool { - if self.nth(1, ignore_whitespace).kind == kind { - println!("Eating {:?}", kind); - self.advance(ignore_whitespace); + pub fn eat(&mut self, kind: SyntaxKind) -> bool { + if self.peek().kind == kind { + self.advance(); true } else { false } } - pub fn at_whitespace(&self) -> bool { - self.nth(0, false).kind == SyntaxKind::Whitespace - } - - pub fn peek(&self, ignore_whitespace: bool) -> &Token { - self.nth(1, ignore_whitespace) - } - - pub fn expect(&mut self, kind: SyntaxKind, ignore_whitespace: bool) { - println!("Expecting {:?}", kind); - if self.eat(kind, ignore_whitespace) { + pub fn expect(&mut self, kind: SyntaxKind) { + if self.eat(kind) { return; } self.error_at(format!("Expected {:#?}", kind)); } - pub fn eof(&self, ignore_whitespace: bool) -> bool { - self.peek(ignore_whitespace).kind == SyntaxKind::Eof - } - /// collects an SyntaxError with an `error` message at the current position fn error_at(&mut self, error: String) { - self.errors.push(SyntaxError::new_at_offset( - error, - self.tokens - .get(min( - self.tokens.len() - 1, - self.whitespace_token_buffer.unwrap_or(self.pos), - )) - .unwrap() - .span - .start(), - )); - } - - /// lookahead method. - /// - /// if `ignore_whitespace` is true, it will skip all whitespace tokens - pub fn nth(&self, lookahead: usize, ignore_whitespace: bool) -> &Token { - if ignore_whitespace { - let mut idx = 0; - let mut non_whitespace_token_ctr = 0; - loop { - match self.tokens.get(self.pos + idx) { - Some(token) => { - if !WHITESPACE_TOKENS.contains(&token.kind) { - if non_whitespace_token_ctr == lookahead { - return token; - } - non_whitespace_token_ctr += 1; - } - idx += 1; - } - None => { - return &self.eof_token; - } - } - } - } else { - match self.tokens.get(self.pos + lookahead) { - Some(token) => token, - None => &self.eof_token, - } - } + todo!(); } } diff --git a/crates/pg_statement_splitter/src/parser/common.rs b/crates/pg_statement_splitter/src/parser/common.rs index 8c2557c04..f20da88cd 100644 --- a/crates/pg_statement_splitter/src/parser/common.rs +++ b/crates/pg_statement_splitter/src/parser/common.rs @@ -1,34 +1,36 @@ -use pg_lexer::{SyntaxKind, Token}; +use pg_lexer::{SyntaxKind, Token, TokenType}; use super::{ + data::at_statement_start, dml::{cte, select}, Parser, }; pub fn source(p: &mut Parser) { loop { - // todo find a better way to handle stmt start - // same problem as below... for the first token we need to use nth(0), - // but for the rest we need to use peek - p.start_stmt(); - statement(p); - p.close_stmt(); - - if p.eof(true) { - break; + match p.peek() { + Token { + kind: SyntaxKind::Eof, + .. + } => { + break; + } + Token { + token_type: TokenType::Whitespace | TokenType::NoKeyword, + .. + } => { + p.advance(); + } + _ => { + statement(p); + } } } } pub(crate) fn statement(p: &mut Parser) { - // todo find a better way to handle first token - let token = if p.pos == 0 { - p.nth(0, true) - } else { - p.peek(true) - }; - - match token.kind { + p.start_stmt(); + match p.peek().kind { SyntaxKind::With => { cte(p); } @@ -52,44 +54,30 @@ pub(crate) fn statement(p: &mut Parser) { // unknown(p); } } + p.close_stmt(); } pub(crate) fn parenthesis(p: &mut Parser) { - p.expect(SyntaxKind::Ascii40, true); + p.expect(SyntaxKind::Ascii40); loop { - if p.eof(true) { - p.expect(SyntaxKind::Ascii41, true); - break; - } - if p.nth(0, true).kind == SyntaxKind::Ascii41 { - break; + match p.peek().kind { + SyntaxKind::Ascii41 | SyntaxKind::Eof => { + p.advance(); + break; + } + _ => { + p.advance(); + } } } } pub(crate) fn unknown(p: &mut Parser) { loop { - match p.peek(false) { - t @ Token { - kind: SyntaxKind::Newline, - .. - } => { - if t.text.chars().count() > 1 { - p.advance(false); - break; - } - } + match p.peek() { Token { - // ";" - kind: SyntaxKind::Ascii59, - .. - } => { - p.advance(false); - break; - } - Token { - kind: SyntaxKind::Eof, + kind: SyntaxKind::Newline | SyntaxKind::Ascii59 | SyntaxKind::Eof, .. } => { break; @@ -101,8 +89,11 @@ pub(crate) fn unknown(p: &mut Parser) { parenthesis(p); } t => { - println!("Unknown token {:?}", t); - p.advance(false); + if at_statement_start(t.kind) { + break; + } + + p.advance(); } } } diff --git a/crates/pg_statement_splitter/src/parser/dml.rs b/crates/pg_statement_splitter/src/parser/dml.rs index 8d1c5e46e..d6e63915b 100644 --- a/crates/pg_statement_splitter/src/parser/dml.rs +++ b/crates/pg_statement_splitter/src/parser/dml.rs @@ -6,14 +6,14 @@ use super::{ }; pub(crate) fn cte(p: &mut Parser) { - p.expect(SyntaxKind::With, true); + p.expect(SyntaxKind::With); loop { - p.expect(SyntaxKind::Ident, true); - p.expect(SyntaxKind::As, true); + p.expect(SyntaxKind::Ident); + p.expect(SyntaxKind::As); parenthesis(p); - if !p.eat(SyntaxKind::Ascii44, true) { + if !p.eat(SyntaxKind::Ascii44) { break; } } @@ -22,7 +22,7 @@ pub(crate) fn cte(p: &mut Parser) { } pub(crate) fn select(p: &mut Parser) { - p.expect(SyntaxKind::Select, true); + p.expect(SyntaxKind::Select); unknown(p); } From 58c037485ff9d316ca10b26d93d81f23406a047f Mon Sep 17 00:00:00 2001 From: psteinroe Date: Fri, 18 Oct 2024 08:58:27 +0200 Subject: [PATCH 04/13] fix: address pr feedback --- crates/pg_statement_splitter/src/parser.rs | 10 +++++++--- crates/pg_statement_splitter/src/parser/common.rs | 2 +- crates/pg_statement_splitter/src/parser/data.rs | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/crates/pg_statement_splitter/src/parser.rs b/crates/pg_statement_splitter/src/parser.rs index 64e393c12..0aa3ecfa2 100644 --- a/crates/pg_statement_splitter/src/parser.rs +++ b/crates/pg_statement_splitter/src/parser.rs @@ -4,14 +4,13 @@ mod dml; pub use common::source; -use std::cmp::min; - use pg_lexer::{lex, SyntaxKind, Token, WHITESPACE_TOKENS}; use text_size::{TextRange, TextSize}; use crate::syntax_error::SyntaxError; /// Main parser that exposes the `cstree` api, and collects errors and statements +/// It is modelled after a Pratt Parser. For a gentle introduction to Pratt Parsing, see https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html pub struct Parser { /// The ranges of the statements ranges: Vec, @@ -53,7 +52,12 @@ impl Parser { Self { ranges: Vec::new(), - eof_token: Token::eof(usize::from(tokens.first().unwrap().span.end())), + eof_token: Token::eof(usize::from( + tokens + .first() + .map(|t| t.span.start()) + .unwrap_or(TextSize::from(0)), + )), errors: Vec::new(), current_stmt_start: None, tokens, diff --git a/crates/pg_statement_splitter/src/parser/common.rs b/crates/pg_statement_splitter/src/parser/common.rs index f20da88cd..842507d4d 100644 --- a/crates/pg_statement_splitter/src/parser/common.rs +++ b/crates/pg_statement_splitter/src/parser/common.rs @@ -50,7 +50,7 @@ pub(crate) fn statement(p: &mut Parser) { // delete(p); } t => { - panic!("stmt: Unknown token {:?}", t); + panic!("stmt: Unknown start token {:?}", t); // unknown(p); } } diff --git a/crates/pg_statement_splitter/src/parser/data.rs b/crates/pg_statement_splitter/src/parser/data.rs index 87bd86734..bb425265c 100644 --- a/crates/pg_statement_splitter/src/parser/data.rs +++ b/crates/pg_statement_splitter/src/parser/data.rs @@ -1,6 +1,6 @@ use pg_lexer::SyntaxKind; -pub static STATEMENT_START_TOKENS: &[SyntaxKind] = &[ +static STATEMENT_START_TOKENS: &[SyntaxKind] = &[ SyntaxKind::With, SyntaxKind::Select, SyntaxKind::Insert, From 3849cf7984d56e5cbf9b026049c4a074967d90fb Mon Sep 17 00:00:00 2001 From: psteinroe Date: Fri, 18 Oct 2024 11:44:55 +0200 Subject: [PATCH 05/13] refactor: parser now uses a pointer into the token vector instead of popping and cloning --- crates/pg_statement_splitter/src/parser.rs | 99 +++++++++++++------ .../src/parser/common.rs | 9 +- 2 files changed, 75 insertions(+), 33 deletions(-) diff --git a/crates/pg_statement_splitter/src/parser.rs b/crates/pg_statement_splitter/src/parser.rs index 0aa3ecfa2..bb2e68b96 100644 --- a/crates/pg_statement_splitter/src/parser.rs +++ b/crates/pg_statement_splitter/src/parser.rs @@ -13,17 +13,17 @@ use crate::syntax_error::SyntaxError; /// It is modelled after a Pratt Parser. For a gentle introduction to Pratt Parsing, see https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html pub struct Parser { /// The ranges of the statements - ranges: Vec, + ranges: Vec<(usize, usize)>, /// The syntax errors accumulated during parsing errors: Vec, /// The start of the current statement, if any - current_stmt_start: Option, + current_stmt_start: Option, /// The tokens to parse pub tokens: Vec, eof_token: Token, - last_token_end: Option, + next_pos: usize, } /// Result of Building @@ -46,66 +46,96 @@ impl Parser { return !WHITESPACE_TOKENS.contains(&t.kind) || (t.kind == SyntaxKind::Newline && t.text.chars().count() > 1); }) - .rev() .cloned() .collect::>(); + let eof_token = Token::eof(usize::from( + tokens + .last() + .map(|t| t.span.start()) + .unwrap_or(TextSize::from(0)), + )); + + // next_pos should be the initialised with the first valid token already + let mut next_pos = 0; + loop { + let token = tokens.get(next_pos).unwrap_or(&eof_token); + + if is_irrelevant_token(token) { + next_pos += 1; + } else { + break; + } + } + Self { ranges: Vec::new(), - eof_token: Token::eof(usize::from( - tokens - .first() - .map(|t| t.span.start()) - .unwrap_or(TextSize::from(0)), - )), + eof_token, errors: Vec::new(), current_stmt_start: None, tokens, - last_token_end: None, + next_pos, } } pub fn finish(self) -> Parse { Parse { - ranges: self.ranges, + ranges: self + .ranges + .iter() + .map(|(start, end)| { + println!("{} {}", start, end); + let from = self.tokens.get(*start); + let to = self.tokens.get(*end).unwrap_or(&self.eof_token); + + TextRange::new(from.unwrap().span.start(), to.span.end()) + }) + .collect(), errors: self.errors, } } /// Start statement - pub fn start_stmt(&mut self) -> Token { + pub fn start_stmt(&mut self) { assert!(self.current_stmt_start.is_none()); - - let token = self.peek(); - - self.current_stmt_start = Some(token.span.start()); - - token + self.current_stmt_start = Some(self.next_pos); } /// Close statement pub fn close_stmt(&mut self) { - self.ranges.push(TextRange::new( + assert!(self.next_pos > 0); + + self.ranges.push(( self.current_stmt_start.expect("Expected active statement"), - self.last_token_end.expect("Expected last token end"), + self.next_pos - 1, )); self.current_stmt_start = None; } - fn advance(&mut self) -> Token { - let token = self.tokens.pop().unwrap_or(self.eof_token.clone()); - - self.last_token_end = Some(token.span.end()); - - token + fn advance(&mut self) -> &Token { + let mut first_relevant_token = None; + loop { + let token = self.tokens.get(self.next_pos).unwrap_or(&self.eof_token); + + // we need to continue with next_pos until the next relevant token after we already + // found the first one + if !is_irrelevant_token(token) { + if let Some(t) = first_relevant_token { + return t; + } + first_relevant_token = Some(token); + } + + self.next_pos += 1; + } } - fn peek(&mut self) -> Token { - self.tokens - .last() - .cloned() - .unwrap_or(self.eof_token.clone()) + fn peek(&self) -> &Token { + match self.tokens.get(self.next_pos) { + Some(token) => token, + None => &self.eof_token, + } } /// checks if the current token is of `kind` and advances if true @@ -132,3 +162,8 @@ impl Parser { todo!(); } } + +fn is_irrelevant_token(t: &Token) -> bool { + return WHITESPACE_TOKENS.contains(&t.kind) + && (t.kind != SyntaxKind::Newline || t.text.chars().count() == 1); +} diff --git a/crates/pg_statement_splitter/src/parser/common.rs b/crates/pg_statement_splitter/src/parser/common.rs index 842507d4d..63076ec35 100644 --- a/crates/pg_statement_splitter/src/parser/common.rs +++ b/crates/pg_statement_splitter/src/parser/common.rs @@ -77,7 +77,14 @@ pub(crate) fn unknown(p: &mut Parser) { loop { match p.peek() { Token { - kind: SyntaxKind::Newline | SyntaxKind::Ascii59 | SyntaxKind::Eof, + kind: SyntaxKind::Ascii59, + .. + } => { + p.advance(); + break; + } + Token { + kind: SyntaxKind::Newline | SyntaxKind::Eof, .. } => { break; From 4e9dc81291ee5f8773846e6637d8586c14fff5ab Mon Sep 17 00:00:00 2001 From: psteinroe Date: Fri, 18 Oct 2024 11:58:07 +0200 Subject: [PATCH 06/13] add test helper --- crates/pg_statement_splitter/src/lib.rs | 52 ++++++++++++++-------- crates/pg_statement_splitter/src/parser.rs | 1 - 2 files changed, 33 insertions(+), 20 deletions(-) diff --git a/crates/pg_statement_splitter/src/lib.rs b/crates/pg_statement_splitter/src/lib.rs index 95ce680bd..56ec0d974 100644 --- a/crates/pg_statement_splitter/src/lib.rs +++ b/crates/pg_statement_splitter/src/lib.rs @@ -20,35 +20,49 @@ mod tests { use super::*; + struct Tester { + input: String, + parse: Parse, + } + + impl From<&str> for Tester { + fn from(input: &str) -> Self { + Tester { + parse: split(input), + input: input.to_string(), + } + } + } + + impl Tester { + fn expect_statements(&self, expected: Vec<&str>) { + assert_eq!(self.parse.ranges.len(), expected.len()); + + for (range, expected) in self.parse.ranges.iter().zip(expected.iter()) { + assert_eq!(*expected, self.input[*range].to_string()); + } + } + } + #[test] #[timeout(1000)] fn basic() { - let input = "select 1 from contact; select 1;"; - - let res = split(input); - assert_eq!(res.ranges.len(), 2); - assert_eq!("select 1 from contact;", input[res.ranges[0]].to_string()); - assert_eq!("select 1;", input[res.ranges[1]].to_string()); + Tester::from("select 1 from contact; select 1;") + .expect_statements(vec!["select 1 from contact;", "select 1;"]); } #[test] fn no_semicolons() { - let input = "select 1 from contact\nselect 1"; - - let res = split(input); - assert_eq!(res.ranges.len(), 2); - assert_eq!("select 1 from contact", input[res.ranges[0]].to_string()); - assert_eq!("select 1", input[res.ranges[1]].to_string()); + Tester::from("select 1 from contact\nselect 1") + .expect_statements(vec!["select 1 from contact", "select 1"]); } #[test] fn double_newlines() { - let input = "select 1 from contact\n\nselect 1\n\nselect 3"; - - let res = split(input); - assert_eq!(res.ranges.len(), 3); - assert_eq!("select 1 from contact", input[res.ranges[0]].to_string()); - assert_eq!("select 1", input[res.ranges[1]].to_string()); - assert_eq!("select 3", input[res.ranges[2]].to_string()); + Tester::from("select 1 from contact\n\nselect 1\n\nselect 3").expect_statements(vec![ + "select 1 from contact", + "select 1", + "select 3", + ]); } } diff --git a/crates/pg_statement_splitter/src/parser.rs b/crates/pg_statement_splitter/src/parser.rs index bb2e68b96..88981e5f3 100644 --- a/crates/pg_statement_splitter/src/parser.rs +++ b/crates/pg_statement_splitter/src/parser.rs @@ -84,7 +84,6 @@ impl Parser { .ranges .iter() .map(|(start, end)| { - println!("{} {}", start, end); let from = self.tokens.get(*start); let to = self.tokens.get(*end).unwrap_or(&self.eof_token); From 345b1ecd0a78bb9c249e8d5c2434e4de60fe29d0 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sat, 19 Oct 2024 11:02:56 +0200 Subject: [PATCH 07/13] feat: add remaining dml statements --- crates/pg_statement_splitter/src/lib.rs | 56 +++++++++++++++++-- crates/pg_statement_splitter/src/parser.rs | 27 ++++++--- .../src/parser/common.rs | 20 +++---- .../pg_statement_splitter/src/parser/dml.rs | 20 +++++++ 4 files changed, 99 insertions(+), 24 deletions(-) diff --git a/crates/pg_statement_splitter/src/lib.rs b/crates/pg_statement_splitter/src/lib.rs index 56ec0d974..3ecceed75 100644 --- a/crates/pg_statement_splitter/src/lib.rs +++ b/crates/pg_statement_splitter/src/lib.rs @@ -1,6 +1,6 @@ -///! Postgres Statement Splitter -///! -///! This crate provides a function to split a SQL source string into individual statements. +//! Postgres Statement Splitter +//! +//! This crate provides a function to split a SQL source string into individual statements. mod parser; mod syntax_error; @@ -36,7 +36,13 @@ mod tests { impl Tester { fn expect_statements(&self, expected: Vec<&str>) { - assert_eq!(self.parse.ranges.len(), expected.len()); + assert_eq!( + self.parse.ranges.len(), + expected.len(), + "Expected {} statements, got {}", + expected.len(), + self.parse.ranges.len() + ); for (range, expected) in self.parse.ranges.iter().zip(expected.iter()) { assert_eq!(*expected, self.input[*range].to_string()); @@ -65,4 +71,46 @@ mod tests { "select 3", ]); } + + #[test] + fn insert_into() { + Tester::from("randomness\ninsert into tbl (id) values (1)\nselect 3").expect_statements( + vec!["randomness", "insert into tbl (id) values (1)", "select 3"], + ); + } + + #[test] + fn update() { + Tester::from("more randomness\nupdate tbl set col = '1'\n\nselect 3").expect_statements( + vec!["more randomness", "update tbl set col = '1'", "select 3"], + ); + } + + #[test] + fn delete_from() { + Tester::from("more randomness\ndelete from test where id = 1\n\nselect 3") + .expect_statements(vec![ + "more randomness", + "delete from test where id = 1", + "select 3", + ]); + } + + #[test] + fn unknown() { + Tester::from("random stuff\n\nmore randomness\n\nselect 3").expect_statements(vec![ + "random stuff", + "more randomness", + "select 3", + ]); + } + + #[test] + fn unknown_2() { + Tester::from("random stuff\nselect 1\n\nselect 3").expect_statements(vec![ + "random stuff", + "select 1", + "select 3", + ]); + } } diff --git a/crates/pg_statement_splitter/src/parser.rs b/crates/pg_statement_splitter/src/parser.rs index 88981e5f3..2bc79509c 100644 --- a/crates/pg_statement_splitter/src/parser.rs +++ b/crates/pg_statement_splitter/src/parser.rs @@ -40,14 +40,7 @@ impl Parser { // we dont care about whitespace tokens, except for double newlines // to make everything simpler, we just filter them out // the token holds the text range, so we dont need to worry about that - let tokens = lex(sql) - .iter() - .filter(|t| { - return !WHITESPACE_TOKENS.contains(&t.kind) - || (t.kind == SyntaxKind::Newline && t.text.chars().count() > 1); - }) - .cloned() - .collect::>(); + let tokens = lex(sql); let eof_token = Token::eof(usize::from( tokens @@ -104,9 +97,25 @@ impl Parser { pub fn close_stmt(&mut self) { assert!(self.next_pos > 0); + // go back the positions until we find the first relevant token + let mut end_token_pos = self.next_pos - 1; + loop { + let token = self.tokens.get(end_token_pos); + + if end_token_pos == 0 || token.is_none() { + break; + } + + if !is_irrelevant_token(token.unwrap()) { + break; + } + + end_token_pos -= 1; + } + self.ranges.push(( self.current_stmt_start.expect("Expected active statement"), - self.next_pos - 1, + end_token_pos, )); self.current_stmt_start = None; diff --git a/crates/pg_statement_splitter/src/parser/common.rs b/crates/pg_statement_splitter/src/parser/common.rs index 63076ec35..6bd5cc814 100644 --- a/crates/pg_statement_splitter/src/parser/common.rs +++ b/crates/pg_statement_splitter/src/parser/common.rs @@ -2,7 +2,7 @@ use pg_lexer::{SyntaxKind, Token, TokenType}; use super::{ data::at_statement_start, - dml::{cte, select}, + dml::{cte, delete, insert, select, update}, Parser, }; @@ -16,7 +16,9 @@ pub fn source(p: &mut Parser) { break; } Token { - token_type: TokenType::Whitespace | TokenType::NoKeyword, + // we might want to ignore TokenType::NoKeyword here too + // but this will lead to invalid statements to not being picked up + token_type: TokenType::Whitespace, .. } => { p.advance(); @@ -38,20 +40,16 @@ pub(crate) fn statement(p: &mut Parser) { select(p); } SyntaxKind::Insert => { - todo!(); - // insert(p); + insert(p); } SyntaxKind::Update => { - todo!(); - // update(p); + update(p); } SyntaxKind::DeleteP => { - todo!(); - // delete(p); + delete(p); } - t => { - panic!("stmt: Unknown start token {:?}", t); - // unknown(p); + _ => { + unknown(p); } } p.close_stmt(); diff --git a/crates/pg_statement_splitter/src/parser/dml.rs b/crates/pg_statement_splitter/src/parser/dml.rs index d6e63915b..2a5fa96a5 100644 --- a/crates/pg_statement_splitter/src/parser/dml.rs +++ b/crates/pg_statement_splitter/src/parser/dml.rs @@ -26,3 +26,23 @@ pub(crate) fn select(p: &mut Parser) { unknown(p); } + +pub(crate) fn insert(p: &mut Parser) { + p.expect(SyntaxKind::Insert); + p.expect(SyntaxKind::Into); + + unknown(p); +} + +pub(crate) fn update(p: &mut Parser) { + p.expect(SyntaxKind::Update); + + unknown(p); +} + +pub(crate) fn delete(p: &mut Parser) { + p.expect(SyntaxKind::DeleteP); + p.expect(SyntaxKind::From); + + unknown(p); +} From 2d130d7fecec78b9cdbbc42d7cfa0ed369a73c92 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sat, 19 Oct 2024 11:05:02 +0200 Subject: [PATCH 08/13] cleanup stmts --- crates/pg_statement_splitter/src/parser.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/crates/pg_statement_splitter/src/parser.rs b/crates/pg_statement_splitter/src/parser.rs index 2bc79509c..ece5ec638 100644 --- a/crates/pg_statement_splitter/src/parser.rs +++ b/crates/pg_statement_splitter/src/parser.rs @@ -37,9 +37,6 @@ pub struct Parse { impl Parser { pub fn new(sql: &str) -> Self { - // we dont care about whitespace tokens, except for double newlines - // to make everything simpler, we just filter them out - // the token holds the text range, so we dont need to worry about that let tokens = lex(sql); let eof_token = Token::eof(usize::from( From d21f26106dfc42f3cd22ebb7abdce04db0238e7d Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sun, 20 Oct 2024 16:57:45 +0200 Subject: [PATCH 09/13] fix: handle insert update delete and select within unknown --- crates/pg_statement_splitter/src/lib.rs | 28 +++++++++ crates/pg_statement_splitter/src/parser.rs | 21 ++++++- .../src/parser/common.rs | 63 +++++++++++++++++-- .../pg_statement_splitter/src/parser/data.rs | 6 +- .../pg_statement_splitter/src/parser/ddl.rs | 9 +++ 5 files changed, 119 insertions(+), 8 deletions(-) create mode 100644 crates/pg_statement_splitter/src/parser/ddl.rs diff --git a/crates/pg_statement_splitter/src/lib.rs b/crates/pg_statement_splitter/src/lib.rs index 3ecceed75..c9d013a02 100644 --- a/crates/pg_statement_splitter/src/lib.rs +++ b/crates/pg_statement_splitter/src/lib.rs @@ -72,6 +72,34 @@ mod tests { ]); } + #[test] + fn insert_with_select() { + Tester::from("\ninsert into tbl (id) select 1\n\nselect 3") + .expect_statements(vec!["insert into tbl (id) select 1", "select 3"]); + } + + #[test] + fn case() { + Tester::from("select case when select 2 then 1 else 0 end") + .expect_statements(vec!["select case when select 2 then 1 else 0 end"]); + } + + #[test] + fn create_rule() { + Tester::from( + "create rule log_employee_insert as +on insert to employees +do also insert into employee_log (action, employee_id, log_time) +values ('insert', new.id, now());", + ) + .expect_statements(vec![ + "create rule log_employee_insert as +on insert to employees +do also insert into employee_log (action, employee_id, log_time) +values ('insert', new.id, now());", + ]); + } + #[test] fn insert_into() { Tester::from("randomness\ninsert into tbl (id) values (1)\nselect 3").expect_statements( diff --git a/crates/pg_statement_splitter/src/parser.rs b/crates/pg_statement_splitter/src/parser.rs index ece5ec638..e96df99b7 100644 --- a/crates/pg_statement_splitter/src/parser.rs +++ b/crates/pg_statement_splitter/src/parser.rs @@ -1,5 +1,6 @@ mod common; mod data; +mod ddl; mod dml; pub use common::source; @@ -143,6 +144,24 @@ impl Parser { } } + fn look_back(&self) -> Option<&Token> { + // we need to look back to the last relevant token + let mut look_back_pos = self.next_pos - 1; + loop { + let token = self.tokens.get(look_back_pos); + + if look_back_pos == 0 || token.is_none() { + return None; + } + + if !is_irrelevant_token(token.unwrap()) { + return token; + } + + look_back_pos -= 1; + } + } + /// checks if the current token is of `kind` and advances if true /// returns true if the current token is of `kind` pub fn eat(&mut self, kind: SyntaxKind) -> bool { @@ -164,7 +183,7 @@ impl Parser { /// collects an SyntaxError with an `error` message at the current position fn error_at(&mut self, error: String) { - todo!(); + todo!("{error}"); } } diff --git a/crates/pg_statement_splitter/src/parser/common.rs b/crates/pg_statement_splitter/src/parser/common.rs index 6bd5cc814..8fbf5b46c 100644 --- a/crates/pg_statement_splitter/src/parser/common.rs +++ b/crates/pg_statement_splitter/src/parser/common.rs @@ -2,6 +2,7 @@ use pg_lexer::{SyntaxKind, Token, TokenType}; use super::{ data::at_statement_start, + ddl::create, dml::{cte, delete, insert, select, update}, Parser, }; @@ -48,6 +49,9 @@ pub(crate) fn statement(p: &mut Parser) { SyntaxKind::DeleteP => { delete(p); } + SyntaxKind::Create => { + create(p); + } _ => { unknown(p); } @@ -71,6 +75,22 @@ pub(crate) fn parenthesis(p: &mut Parser) { } } +pub(crate) fn case(p: &mut Parser) { + p.expect(SyntaxKind::Case); + + loop { + match p.peek().kind { + SyntaxKind::EndP => { + p.advance(); + break; + } + _ => { + p.advance(); + } + } + } +} + pub(crate) fn unknown(p: &mut Parser) { loop { match p.peek() { @@ -87,19 +107,52 @@ pub(crate) fn unknown(p: &mut Parser) { } => { break; } + Token { + kind: SyntaxKind::Case, + .. + } => { + case(p); + } Token { kind: SyntaxKind::Ascii40, .. } => { parenthesis(p); } - t => { - if at_statement_start(t.kind) { + t => match at_statement_start(t.kind) { + Some(SyntaxKind::Select) => { + // we need to check for `as` here to not break on `select as` + if p.look_back().map(|t| t.kind) != Some(SyntaxKind::As) { + break; + } + p.advance(); + } + Some(SyntaxKind::Insert) | Some(SyntaxKind::Update) | Some(SyntaxKind::DeleteP) => { + let prev = p.look_back().map(|t| t.kind); + if [ + // for create trigger + SyntaxKind::After, + // for create rule + SyntaxKind::On, + // for create rule + SyntaxKind::Also, + // for create rule + SyntaxKind::Instead, + ] + .iter() + .all(|x| Some(x) != prev.as_ref()) + { + break; + } + p.advance(); + } + Some(_) => { break; } - - p.advance(); - } + None => { + p.advance(); + } + }, } } } diff --git a/crates/pg_statement_splitter/src/parser/data.rs b/crates/pg_statement_splitter/src/parser/data.rs index bb425265c..6dd841136 100644 --- a/crates/pg_statement_splitter/src/parser/data.rs +++ b/crates/pg_statement_splitter/src/parser/data.rs @@ -1,5 +1,7 @@ use pg_lexer::SyntaxKind; +// All tokens listed here must be explicitly handled in the `unknown` function to ensure that we do +// not break in the middle of another statement that contains a statement start token. static STATEMENT_START_TOKENS: &[SyntaxKind] = &[ SyntaxKind::With, SyntaxKind::Select, @@ -9,6 +11,6 @@ static STATEMENT_START_TOKENS: &[SyntaxKind] = &[ SyntaxKind::Create, ]; -pub(crate) fn at_statement_start(kind: SyntaxKind) -> bool { - STATEMENT_START_TOKENS.contains(&kind) +pub(crate) fn at_statement_start(kind: SyntaxKind) -> Option { + STATEMENT_START_TOKENS.iter().find(|&x| x == &kind).cloned() } diff --git a/crates/pg_statement_splitter/src/parser/ddl.rs b/crates/pg_statement_splitter/src/parser/ddl.rs new file mode 100644 index 000000000..113a9d38c --- /dev/null +++ b/crates/pg_statement_splitter/src/parser/ddl.rs @@ -0,0 +1,9 @@ +use pg_lexer::SyntaxKind; + +use super::{common::unknown, Parser}; + +pub(crate) fn create(p: &mut Parser) { + p.expect(SyntaxKind::Create); + + unknown(p); +} From 130877043bedb268ff89ba10076630afebe71650 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sun, 20 Oct 2024 17:43:25 +0200 Subject: [PATCH 10/13] fix: cleanup and fix some clippy warnings (sorry, unrelated) --- crates/pg_base_db/src/change.rs | 45 ++++------- crates/pg_base_db/src/document.rs | 31 +++----- crates/pg_statement_splitter/src/lib.rs | 67 +++++++++++++++- crates/pg_statement_splitter/src/parser.rs | 10 +-- .../src/parser/common.rs | 11 ++- .../pg_statement_splitter/src/parser/data.rs | 10 ++- .../pg_statement_splitter/src/parser/ddl.rs | 8 +- .../pg_statement_splitter/src/parser/dml.rs | 8 +- .../pg_statement_splitter/tests/skipped.txt | 12 --- .../tests/statement_splitter_tests.rs | 78 ------------------- 10 files changed, 122 insertions(+), 158 deletions(-) delete mode 100644 crates/pg_statement_splitter/tests/skipped.txt diff --git a/crates/pg_base_db/src/change.rs b/crates/pg_base_db/src/change.rs index 26a926ffa..a181eee18 100644 --- a/crates/pg_base_db/src/change.rs +++ b/crates/pg_base_db/src/change.rs @@ -42,9 +42,9 @@ impl ChangedStatement { } } -fn apply_text_change(text: &String, range: Option, change_text: &String) -> String { +fn apply_text_change(text: &str, range: Option, change_text: &str) -> String { if range.is_none() { - return change_text.clone(); + return change_text.to_string(); } let range = range.unwrap(); @@ -53,7 +53,7 @@ fn apply_text_change(text: &String, range: Option, change_text: &Stri let mut new_text = String::new(); new_text.push_str(&text[..start]); - new_text.push_str(&change_text); + new_text.push_str(change_text); new_text.push_str(&text[end..]); new_text @@ -97,7 +97,7 @@ impl Change { self.range.is_some() && self.text.len() < self.range.unwrap().len().into() } - pub fn apply_to_text(&self, text: &String) -> String { + pub fn apply_to_text(&self, text: &str) -> String { if self.range.is_none() { return self.text.clone(); } @@ -122,14 +122,10 @@ impl Change { changed_statements.extend( doc.drain_statements() .into_iter() - .map(|s| StatementChange::Deleted(s)), + .map(StatementChange::Deleted), ); // TODO also use errors returned by extract sql statement ranges - doc.statement_ranges = pg_statement_splitter::split(&self.text) - .ranges - .iter() - .map(|r| r.clone()) - .collect(); + doc.statement_ranges = pg_statement_splitter::split(&self.text).ranges.to_vec(); doc.text = self.text.clone(); doc.line_index = LineIndex::new(&doc.text); @@ -155,7 +151,7 @@ impl Change { changed_statements.push(StatementChange::Modified(ChangedStatement { statement: StatementRef { idx: pos, - text: doc.text[r.clone()].to_string(), + text: doc.text[*r].to_string(), document_url: doc.url.clone(), }, // change must be relative to statement @@ -166,15 +162,9 @@ impl Change { // if addition, expand the range // if deletion, shrink the range if self.is_addition() { - *r = TextRange::new( - r.start(), - r.end() + TextSize::from(self.diff_size()), - ); + *r = TextRange::new(r.start(), r.end() + self.diff_size()); } else if self.is_deletion() { - *r = TextRange::new( - r.start(), - r.end() - TextSize::from(self.diff_size()), - ); + *r = TextRange::new(r.start(), r.end() - self.diff_size()); } } else if self.is_addition() { *r += self.diff_size(); @@ -206,7 +196,7 @@ impl Change { { changed_statements.push(StatementChange::Deleted(StatementRef { idx, - text: doc.text[r.clone()].to_string(), + text: doc.text[*r].to_string(), document_url: doc.url.clone(), })); @@ -344,15 +334,14 @@ mod tests { assert_eq!(d.statement_ranges.len(), 2); for r in &pg_statement_splitter::split(&d.text).ranges { - assert_eq!( - d.statement_ranges.iter().position(|x| r == x).is_some(), - true, + assert!( + d.statement_ranges.iter().any(|x| r == x), "should have stmt with range {:#?}", r ); } - assert_eq!(d.statement_ranges[0], TextRange::new(0.into(), 26.into())); + assert_eq!(d.statement_ranges[0], TextRange::new(0.into(), 25.into())); assert_eq!(d.statement_ranges[1], TextRange::new(26.into(), 35.into())); } @@ -364,8 +353,8 @@ mod tests { assert_eq!(d.statement_ranges.len(), 2); - let stmt_1_range = d.statement_ranges[0].clone(); - let stmt_2_range = d.statement_ranges[1].clone(); + let stmt_1_range = d.statement_ranges[0]; + let stmt_2_range = d.statement_ranges[1]; let update_text = " contacts;"; @@ -522,8 +511,8 @@ mod tests { assert_eq!(d.statement_ranges.len(), 2); - let stmt_1_range = d.statement_ranges[0].clone(); - let stmt_2_range = d.statement_ranges[1].clone(); + let stmt_1_range = d.statement_ranges[0]; + let stmt_2_range = d.statement_ranges[1]; let update_text = ",test"; diff --git a/crates/pg_base_db/src/document.rs b/crates/pg_base_db/src/document.rs index a98388332..a8658cd22 100644 --- a/crates/pg_base_db/src/document.rs +++ b/crates/pg_base_db/src/document.rs @@ -1,4 +1,4 @@ -use std::{hash::Hash, hash::Hasher, ops::RangeBounds, usize}; +use std::{hash::Hash, hash::Hasher, ops::RangeBounds}; use line_index::LineIndex; use text_size::{TextRange, TextSize}; @@ -44,18 +44,11 @@ impl Document { pub fn new(url: PgLspPath, text: Option) -> Document { Document { version: 0, - line_index: LineIndex::new(&text.as_ref().unwrap_or(&"".to_string())), + line_index: LineIndex::new(text.as_ref().unwrap_or(&"".to_string())), // TODO: use errors returned by split - statement_ranges: text.as_ref().map_or_else( - || Vec::new(), - |f| { - pg_statement_splitter::split(&f) - .ranges - .iter() - .map(|range| range.clone()) - .collect() - }, - ), + statement_ranges: text.as_ref().map_or_else(Vec::new, |f| { + pg_statement_splitter::split(f).ranges.to_vec() + }), text: text.unwrap_or("".to_string()), url, } @@ -99,7 +92,7 @@ impl Document { .enumerate() .map(|(idx, range)| StatementRef { document_url: self.url.clone(), - text: self.text[range.clone()].to_string(), + text: self.text[range].to_string(), idx, }) .collect() @@ -112,10 +105,10 @@ impl Document { .enumerate() .map(|(idx, range)| { ( - range.clone(), + *range, StatementRef { document_url: self.url.clone(), - text: self.text[range.clone()].to_string(), + text: self.text[*range].to_string(), idx, }, ) @@ -130,7 +123,7 @@ impl Document { .enumerate() .map(|(idx, range)| StatementRef { document_url: self.url.clone(), - text: self.text[range.clone()].to_string(), + text: self.text[*range].to_string(), idx, }) .collect() @@ -142,7 +135,7 @@ impl Document { .get(pos) .map(|range| StatementRef { document_url: self.url.clone(), - text: self.text[range.clone()].to_string(), + text: self.text[*range].to_string(), idx: pos, }) .unwrap() @@ -154,10 +147,10 @@ impl Document { .get(pos) .map(|range| { ( - range.clone(), + *range, StatementRef { document_url: self.url.clone(), - text: self.text[range.clone()].to_string(), + text: self.text[*range].to_string(), idx: pos, }, ) diff --git a/crates/pg_statement_splitter/src/lib.rs b/crates/pg_statement_splitter/src/lib.rs index c9d013a02..ab4bafa87 100644 --- a/crates/pg_statement_splitter/src/lib.rs +++ b/crates/pg_statement_splitter/src/lib.rs @@ -17,6 +17,9 @@ pub fn split(sql: &str) -> Parse { #[cfg(test)] mod tests { use ntest::timeout; + use pg_lexer::SyntaxKind; + use syntax_error::SyntaxError; + use text_size::TextRange; use super::*; @@ -35,18 +38,42 @@ mod tests { } impl Tester { - fn expect_statements(&self, expected: Vec<&str>) { + fn expect_statements(&self, expected: Vec<&str>) -> &Self { assert_eq!( self.parse.ranges.len(), expected.len(), - "Expected {} statements, got {}", + "Expected {} statements, got {}: {:?}", expected.len(), - self.parse.ranges.len() + self.parse.ranges.len(), + self.parse + .ranges + .iter() + .map(|r| &self.input[*r]) + .collect::>() ); for (range, expected) in self.parse.ranges.iter().zip(expected.iter()) { assert_eq!(*expected, self.input[*range].to_string()); } + + self + } + + fn expect_errors(&self, expected: Vec) -> &Self { + assert_eq!( + self.parse.errors.len(), + expected.len(), + "Expected {} errors, got {}: {:?}", + expected.len(), + self.parse.errors.len(), + self.parse.errors + ); + + for (err, expected) in self.parse.errors.iter().zip(expected.iter()) { + assert_eq!(expected, err); + } + + self } } @@ -72,6 +99,16 @@ mod tests { ]); } + #[test] + fn insert_expect_error() { + Tester::from("\ninsert select 1\n\nselect 3") + .expect_statements(vec!["insert select 1", "select 3"]) + .expect_errors(vec![SyntaxError::new( + format!("Expected {:?}", SyntaxKind::Into), + TextRange::new(8.into(), 14.into()), + )]); + } + #[test] fn insert_with_select() { Tester::from("\ninsert into tbl (id) select 1\n\nselect 3") @@ -84,6 +121,28 @@ mod tests { .expect_statements(vec!["select case when select 2 then 1 else 0 end"]); } + #[test] + #[timeout(1000)] + fn simple_select() { + Tester::from( + " +select id, name, test1231234123, unknown from co; + +select 14433313331333 + +alter table test drop column id; + +select lower('test'); +", + ) + .expect_statements(vec![ + "select id, name, test1231234123, unknown from co;", + "select 14433313331333", + "alter table test drop column id;", + "select lower('test');", + ]); + } + #[test] fn create_rule() { Tester::from( @@ -103,7 +162,7 @@ values ('insert', new.id, now());", #[test] fn insert_into() { Tester::from("randomness\ninsert into tbl (id) values (1)\nselect 3").expect_statements( - vec!["randomness", "insert into tbl (id) values (1)", "select 3"], + vec!["randomness", "insert into tbl (id) values (1)\nselect 3"], ); } diff --git a/crates/pg_statement_splitter/src/parser.rs b/crates/pg_statement_splitter/src/parser.rs index e96df99b7..33fcfaf73 100644 --- a/crates/pg_statement_splitter/src/parser.rs +++ b/crates/pg_statement_splitter/src/parser.rs @@ -178,12 +178,10 @@ impl Parser { return; } - self.error_at(format!("Expected {:#?}", kind)); - } - - /// collects an SyntaxError with an `error` message at the current position - fn error_at(&mut self, error: String) { - todo!("{error}"); + self.errors.push(SyntaxError::new( + format!("Expected {:#?}", kind), + self.peek().span, + )); } } diff --git a/crates/pg_statement_splitter/src/parser/common.rs b/crates/pg_statement_splitter/src/parser/common.rs index 8fbf5b46c..c46774089 100644 --- a/crates/pg_statement_splitter/src/parser/common.rs +++ b/crates/pg_statement_splitter/src/parser/common.rs @@ -2,7 +2,7 @@ use pg_lexer::{SyntaxKind, Token, TokenType}; use super::{ data::at_statement_start, - ddl::create, + ddl::{alter, create}, dml::{cte, delete, insert, select, update}, Parser, }; @@ -52,8 +52,11 @@ pub(crate) fn statement(p: &mut Parser) { SyntaxKind::Create => { create(p); } + SyntaxKind::Alter => { + alter(p); + } _ => { - unknown(p); + unknown(p, &[]); } } p.close_stmt(); @@ -91,7 +94,7 @@ pub(crate) fn case(p: &mut Parser) { } } -pub(crate) fn unknown(p: &mut Parser) { +pub(crate) fn unknown(p: &mut Parser, exclude: &[SyntaxKind]) { loop { match p.peek() { Token { @@ -119,7 +122,7 @@ pub(crate) fn unknown(p: &mut Parser) { } => { parenthesis(p); } - t => match at_statement_start(t.kind) { + t => match at_statement_start(t.kind, exclude) { Some(SyntaxKind::Select) => { // we need to check for `as` here to not break on `select as` if p.look_back().map(|t| t.kind) != Some(SyntaxKind::As) { diff --git a/crates/pg_statement_splitter/src/parser/data.rs b/crates/pg_statement_splitter/src/parser/data.rs index 6dd841136..543896ddc 100644 --- a/crates/pg_statement_splitter/src/parser/data.rs +++ b/crates/pg_statement_splitter/src/parser/data.rs @@ -2,6 +2,8 @@ use pg_lexer::SyntaxKind; // All tokens listed here must be explicitly handled in the `unknown` function to ensure that we do // not break in the middle of another statement that contains a statement start token. +// +// All of these statements must have a dedicated parser function called from the `statement` function static STATEMENT_START_TOKENS: &[SyntaxKind] = &[ SyntaxKind::With, SyntaxKind::Select, @@ -9,8 +11,12 @@ static STATEMENT_START_TOKENS: &[SyntaxKind] = &[ SyntaxKind::Update, SyntaxKind::DeleteP, SyntaxKind::Create, + SyntaxKind::Alter, ]; -pub(crate) fn at_statement_start(kind: SyntaxKind) -> Option { - STATEMENT_START_TOKENS.iter().find(|&x| x == &kind).cloned() +pub(crate) fn at_statement_start(kind: SyntaxKind, exclude: &[SyntaxKind]) -> Option<&SyntaxKind> { + STATEMENT_START_TOKENS + .iter() + .filter(|&x| !exclude.contains(x)) + .find(|&x| x == &kind) } diff --git a/crates/pg_statement_splitter/src/parser/ddl.rs b/crates/pg_statement_splitter/src/parser/ddl.rs index 113a9d38c..80119b6f1 100644 --- a/crates/pg_statement_splitter/src/parser/ddl.rs +++ b/crates/pg_statement_splitter/src/parser/ddl.rs @@ -5,5 +5,11 @@ use super::{common::unknown, Parser}; pub(crate) fn create(p: &mut Parser) { p.expect(SyntaxKind::Create); - unknown(p); + unknown(p, &[]); +} + +pub(crate) fn alter(p: &mut Parser) { + p.expect(SyntaxKind::Alter); + + unknown(p, &[]); } diff --git a/crates/pg_statement_splitter/src/parser/dml.rs b/crates/pg_statement_splitter/src/parser/dml.rs index 2a5fa96a5..40e59cea9 100644 --- a/crates/pg_statement_splitter/src/parser/dml.rs +++ b/crates/pg_statement_splitter/src/parser/dml.rs @@ -24,25 +24,25 @@ pub(crate) fn cte(p: &mut Parser) { pub(crate) fn select(p: &mut Parser) { p.expect(SyntaxKind::Select); - unknown(p); + unknown(p, &[]); } pub(crate) fn insert(p: &mut Parser) { p.expect(SyntaxKind::Insert); p.expect(SyntaxKind::Into); - unknown(p); + unknown(p, &[SyntaxKind::Select]); } pub(crate) fn update(p: &mut Parser) { p.expect(SyntaxKind::Update); - unknown(p); + unknown(p, &[]); } pub(crate) fn delete(p: &mut Parser) { p.expect(SyntaxKind::DeleteP); p.expect(SyntaxKind::From); - unknown(p); + unknown(p, &[]); } diff --git a/crates/pg_statement_splitter/tests/skipped.txt b/crates/pg_statement_splitter/tests/skipped.txt deleted file mode 100644 index 480089b91..000000000 --- a/crates/pg_statement_splitter/tests/skipped.txt +++ /dev/null @@ -1,12 +0,0 @@ -brin -brin_bloom -brin_multi -collate.icu.utf8 -collate.linux.utf8 -collate -copy2 -create_table_like -drop_operator -replica_identity -unicode -xmlmap diff --git a/crates/pg_statement_splitter/tests/statement_splitter_tests.rs b/crates/pg_statement_splitter/tests/statement_splitter_tests.rs index fb639fef1..b4ea1de65 100644 --- a/crates/pg_statement_splitter/tests/statement_splitter_tests.rs +++ b/crates/pg_statement_splitter/tests/statement_splitter_tests.rs @@ -1,84 +1,6 @@ use std::fs::{self}; const DATA_DIR_PATH: &str = "tests/data/"; -const POSTGRES_REGRESS_PATH: &str = "../../libpg_query/test/sql/postgres_regress/"; -const SKIPPED_REGRESS_TESTS: &str = include_str!("skipped.txt"); - -#[test] -fn test_postgres_regress() { - // all postgres regress tests are valid and complete statements, so we can use `split_with_parser` and compare with our own splitter - - let mut paths: Vec<_> = fs::read_dir(POSTGRES_REGRESS_PATH) - .unwrap() - .map(|r| r.unwrap()) - .collect(); - paths.sort_by_key(|dir| dir.path()); - - for f in paths.iter() { - let path = f.path(); - - let test_name = path.file_stem().unwrap().to_str().unwrap(); - - // these require fixes in the parser - if SKIPPED_REGRESS_TESTS - .lines() - .collect::>() - .contains(&test_name) - { - continue; - } - - // remove \commands because pg_query doesn't support them - let contents = fs::read_to_string(&path) - .unwrap() - .lines() - .filter(|l| !l.starts_with("\\") && !l.ends_with("\\gset")) - .collect::>() - .join(" "); - - let libpg_query_split = pg_query::split_with_parser(&contents).unwrap(); - - let parser_split = pg_statement_splitter::split(&contents); - - assert_eq!( - parser_split.errors.len(), - 0, - "Unexpected errors when parsing file {}:\n{:#?}", - test_name, - parser_split.errors - ); - - assert_eq!( - libpg_query_split.len(), - parser_split.ranges.len(), - "Mismatch in statement count for file {}: Expected {} statements, got {}", - test_name, - libpg_query_split.len(), - parser_split.ranges.len() - ); - - for (libpg_query_stmt, parser_range) in - libpg_query_split.iter().zip(parser_split.ranges.iter()) - { - let parser_stmt = &contents[parser_range.clone()].trim(); - - let libpg_query_stmt = if libpg_query_stmt.ends_with(';') { - libpg_query_stmt.to_string() - } else { - format!("{};", libpg_query_stmt.trim()) - }; - - let libpg_query_stmt_trimmed = libpg_query_stmt.trim(); - let parser_stmt_trimmed = parser_stmt.trim(); - - assert_eq!( - libpg_query_stmt_trimmed, parser_stmt_trimmed, - "Mismatch in statement {}:\nlibg_query: '{}'\nsplitter: '{}'", - test_name, libpg_query_stmt_trimmed, parser_stmt_trimmed - ); - } - } -} #[test] fn test_statement_splitter() { From e2dba40aa58bf3b9fca6977dfc50e76c53c76870 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sun, 20 Oct 2024 17:46:30 +0200 Subject: [PATCH 11/13] fix: handle create rule with select --- .../pg_statement_splitter/src/parser/common.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/crates/pg_statement_splitter/src/parser/common.rs b/crates/pg_statement_splitter/src/parser/common.rs index c46774089..b723af287 100644 --- a/crates/pg_statement_splitter/src/parser/common.rs +++ b/crates/pg_statement_splitter/src/parser/common.rs @@ -124,10 +124,23 @@ pub(crate) fn unknown(p: &mut Parser, exclude: &[SyntaxKind]) { } t => match at_statement_start(t.kind, exclude) { Some(SyntaxKind::Select) => { - // we need to check for `as` here to not break on `select as` - if p.look_back().map(|t| t.kind) != Some(SyntaxKind::As) { + let prev = p.look_back().map(|t| t.kind); + if [ + // for create view / table as + SyntaxKind::As, + // for create rule + SyntaxKind::On, + // for create rule + SyntaxKind::Also, + // for create rule + SyntaxKind::Instead, + ] + .iter() + .all(|x| Some(x) != prev.as_ref()) + { break; } + p.advance(); } Some(SyntaxKind::Insert) | Some(SyntaxKind::Update) | Some(SyntaxKind::DeleteP) => { From 9f1595337cce622651edcfbce22fb9622e413502 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sun, 20 Oct 2024 17:48:17 +0200 Subject: [PATCH 12/13] fix: make ntest a dev dep --- crates/pg_statement_splitter/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/pg_statement_splitter/Cargo.toml b/crates/pg_statement_splitter/Cargo.toml index 18664bbef..af00de08f 100644 --- a/crates/pg_statement_splitter/Cargo.toml +++ b/crates/pg_statement_splitter/Cargo.toml @@ -6,8 +6,8 @@ edition = "2021" [dependencies] pg_lexer.workspace = true text-size = "1.1.1" -ntest = "0.9.3" [dev-dependencies] pg_query = "0.8" +ntest = "0.9.3" From 5f084207aa0ee8ba04c1c5edac72dceed5444e86 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Sun, 20 Oct 2024 17:58:43 +0200 Subject: [PATCH 13/13] fix: build error --- crates/pg_lsp/src/server/debouncer/thread.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/pg_lsp/src/server/debouncer/thread.rs b/crates/pg_lsp/src/server/debouncer/thread.rs index 1aa85939c..3d20aaed0 100644 --- a/crates/pg_lsp/src/server/debouncer/thread.rs +++ b/crates/pg_lsp/src/server/debouncer/thread.rs @@ -1,3 +1,5 @@ +#![allow(dead_code)] + use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; use std::thread::{self, JoinHandle};