diff --git a/Cargo.lock b/Cargo.lock index 5ee420c698ee..4ea9552fe4ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2817,6 +2817,7 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemalloc-sys", "tokio", + "unicode-segmentation", "uuid", ] @@ -4753,7 +4754,6 @@ dependencies = [ "tonic", "tower", "typetag", - "unicode-segmentation", "url", "uuid", "walkdir", diff --git a/src/common/base/Cargo.toml b/src/common/base/Cargo.toml index 7d294c83525d..20bf5f18b47a 100644 --- a/src/common/base/Cargo.toml +++ b/src/common/base/Cargo.toml @@ -56,6 +56,7 @@ state = "0.5" tikv-jemalloc-ctl = { workspace = true } tikv-jemalloc-sys = "0.5.2" tokio = { workspace = true } +unicode-segmentation = "1.10.1" uuid = { workspace = true } [dev-dependencies] diff --git a/src/common/base/src/base/mod.rs b/src/common/base/src/base/mod.rs index 6bb825c8f110..d4e8faefa89b 100644 --- a/src/common/base/src/base/mod.rs +++ b/src/common/base/src/base/mod.rs @@ -47,6 +47,7 @@ pub use string::escape_for_key; pub use string::format_byte_size; pub use string::mask_connection_info; pub use string::mask_string; +pub use string::short_sql; pub use string::unescape_for_key; pub use string::unescape_string; pub use take_mut::take_mut; diff --git a/src/common/base/src/base/string.rs b/src/common/base/src/base/string.rs index 0e50933f13d2..d81013fc3ad8 100644 --- a/src/common/base/src/base/string.rs +++ b/src/common/base/src/base/string.rs @@ -17,6 +17,7 @@ use std::string::FromUtf8Error; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use regex::Regex; +use unicode_segmentation::UnicodeSegmentation; /// Function that escapes special characters in a string. /// @@ -188,3 +189,33 @@ pub fn mask_connection_info(sql: &str) -> String { masked_sql } + +/// Maximum length of the SQL query to be displayed or log. +/// If the query exceeds this length and starts with keywords, +/// it will be truncated and appended with the remaining length. +pub fn short_sql(sql: String) -> String { + const MAX_LENGTH: usize = 128; + let keywords = ["INSERT"]; + + fn starts_with_any(query: &str, keywords: &[&str]) -> bool { + keywords + .iter() + .any(|&keyword| query.to_uppercase().starts_with(keyword)) + } + + let query = sql.trim_start(); + + // Graphemes represent user-perceived characters, which might be composed + // of multiple Unicode code points. + // This ensures that we handle complex characters like emojis or + // accented characters properly. + if query.graphemes(true).count() > MAX_LENGTH && starts_with_any(query, &keywords) { + let truncated: String = query.graphemes(true).take(MAX_LENGTH).collect(); + let original_length = query.graphemes(true).count(); + let remaining_length = original_length.saturating_sub(MAX_LENGTH); + // Append the remaining length indicator + truncated + &format!("...[{} more characters]", remaining_length) + } else { + query.to_string() + } +} diff --git a/src/common/base/tests/it/string.rs b/src/common/base/tests/it/string.rs index fe35b316ece6..4b49c57f2673 100644 --- a/src/common/base/tests/it/string.rs +++ b/src/common/base/tests/it/string.rs @@ -14,6 +14,7 @@ use databend_common_base::base::*; use databend_common_exception::Result; +use unicode_segmentation::UnicodeSegmentation; #[test] fn test_progress() -> Result<()> { @@ -107,3 +108,47 @@ fn test_mask_connection_info() { assert_eq!(expect, actual); } + +#[test] +fn test_short_sql() { + // Test case 1: SQL query shorter than 128 characters + let sql1 = "SELECT * FROM users WHERE id = 1;".to_string(); + assert_eq!(short_sql(sql1.clone()), sql1); + + // Test case 2: SQL query longer than 128 characters and starts with "INSERT" + let long_sql_insert = "INSERT INTO users (id, name, email) VALUES ".to_string() + + &"(1, 'John Doe', 'john@example.com'), ".repeat(5); // Make sure this creates a string longer than 128 characters + let expected_length_insert = long_sql_insert.graphemes(true).count().saturating_sub(128); + let expected_result_insert = { + let truncated: String = long_sql_insert.graphemes(true).take(128).collect(); + truncated + &format!("...[{} more characters]", expected_length_insert) + }; + assert_eq!(short_sql(long_sql_insert.clone()), expected_result_insert); + + // Test case 3: SQL query longer than 128 characters but does not start with "INSERT" + let long_sql_update = + "UPDATE users SET name = 'John' WHERE id = 1;".to_string() + &"id = 1 OR ".repeat(20); // Make sure this creates a string longer than 128 characters + assert_eq!(short_sql(long_sql_update.clone()), long_sql_update); + + // Test case 4: Empty SQL query + let empty_sql = "".to_string(); + assert_eq!(short_sql(empty_sql.clone()), empty_sql); + + // Test case 5: SQL query with leading whitespace + let sql_with_whitespace = + " INSERT INTO users (id, name, email) VALUES (1, 'John Doe', 'john@example.com');" + .to_string(); + let trimmed_sql = sql_with_whitespace.trim_start().to_string(); + assert_eq!(short_sql(sql_with_whitespace.clone()), trimmed_sql); + + // Test case 6: SQL query with multiple emojis to test truncation at an emoji point + let emoji_sql = "INSERT INTO users (id, name) VALUES (1, 'John Doe 😊😊😊😊😊😊😊😊😊😊');" + .to_string() + + &" more text to exceed 128 characters ".repeat(3); + let expected_emoji_result = { + let truncated: String = emoji_sql.graphemes(true).take(128).collect(); + let remaining_length = emoji_sql.graphemes(true).count().saturating_sub(128); + truncated + &format!("...[{} more characters]", remaining_length) + }; + assert_eq!(short_sql(emoji_sql.clone()), expected_emoji_result); +} diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 08c7dfe9d8cc..1bb11b4d951b 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -174,7 +174,6 @@ tokio-stream = { workspace = true, features = ["net"] } toml = { version = "0.8", default-features = false } tonic = { workspace = true } typetag = { workspace = true } -unicode-segmentation = "1.10.1" uuid = { workspace = true } walkdir = { workspace = true } xorf = { version = "0.11.0", default-features = false, features = ["binary-fuse"] } diff --git a/src/query/service/src/interpreters/interpreter.rs b/src/query/service/src/interpreters/interpreter.rs index 06d479a072a6..868a53513451 100644 --- a/src/query/service/src/interpreters/interpreter.rs +++ b/src/query/service/src/interpreters/interpreter.rs @@ -18,6 +18,7 @@ use std::time::SystemTime; use databend_common_ast::ast::Literal; use databend_common_ast::ast::Statement; +use databend_common_base::base::short_sql; use databend_common_base::runtime::profile::get_statistics_desc; use databend_common_base::runtime::profile::ProfileDesc; use databend_common_base::runtime::profile::ProfileStatisticsName; @@ -46,7 +47,6 @@ use crate::pipelines::executor::ExecutorSettings; use crate::pipelines::executor::PipelineCompleteExecutor; use crate::pipelines::executor::PipelinePullingExecutor; use crate::pipelines::PipelineBuildResult; -use crate::sessions::short_sql; use crate::sessions::QueryContext; use crate::sessions::SessionManager; use crate::stream::DataBlockStream; diff --git a/src/query/service/src/servers/http/clickhouse_handler.rs b/src/query/service/src/servers/http/clickhouse_handler.rs index 1073b05721d8..bb8327b00eae 100644 --- a/src/query/service/src/servers/http/clickhouse_handler.rs +++ b/src/query/service/src/servers/http/clickhouse_handler.rs @@ -16,6 +16,7 @@ use std::collections::HashMap; use std::sync::Arc; use async_stream::stream; +use databend_common_base::base::short_sql; use databend_common_base::base::tokio; use databend_common_base::base::tokio::sync::mpsc::Sender; use databend_common_base::base::tokio::task::JoinHandle; @@ -64,7 +65,6 @@ use crate::interpreters::InterpreterFactory; use crate::interpreters::InterpreterPtr; use crate::servers::http::middleware::sanitize_request_headers; use crate::servers::http::v1::HttpQueryContext; -use crate::sessions::short_sql; use crate::sessions::QueriesQueueManager; use crate::sessions::QueryContext; use crate::sessions::QueryEntry; diff --git a/src/query/service/src/servers/http/v1/query/http_query.rs b/src/query/service/src/servers/http/v1/query/http_query.rs index f3f298f69155..abc36c524417 100644 --- a/src/query/service/src/servers/http/v1/query/http_query.rs +++ b/src/query/service/src/servers/http/v1/query/http_query.rs @@ -20,6 +20,7 @@ use std::sync::Arc; use std::time::Duration; use std::time::Instant; +use databend_common_base::base::short_sql; use databend_common_base::base::tokio; use databend_common_base::base::tokio::sync::Mutex as TokioMutex; use databend_common_base::base::tokio::sync::RwLock; @@ -60,7 +61,6 @@ use crate::servers::http::v1::HttpQueryManager; use crate::servers::http::v1::QueryError; use crate::servers::http::v1::QueryResponse; use crate::servers::http::v1::QueryStats; -use crate::sessions::short_sql; use crate::sessions::QueryAffect; use crate::sessions::Session; use crate::sessions::SessionType; diff --git a/src/query/service/src/sessions/mod.rs b/src/query/service/src/sessions/mod.rs index b8a8c4c945c7..51df83e4a707 100644 --- a/src/query/service/src/sessions/mod.rs +++ b/src/query/service/src/sessions/mod.rs @@ -29,7 +29,6 @@ pub use databend_common_catalog::table_context::TableContext; pub use query_affect::QueryAffect; pub use query_ctx::convert_query_log_timestamp; pub use query_ctx::QueryContext; -pub use query_ctx_shared::short_sql; pub use query_ctx_shared::QueryContextShared; pub use queue_mgr::AcquireQueueGuard; pub use queue_mgr::QueriesQueueManager; diff --git a/src/query/service/src/sessions/query_ctx_shared.rs b/src/query/service/src/sessions/query_ctx_shared.rs index 2b3673cfaf24..6639bb80f6c9 100644 --- a/src/query/service/src/sessions/query_ctx_shared.rs +++ b/src/query/service/src/sessions/query_ctx_shared.rs @@ -23,6 +23,7 @@ use std::time::Duration; use std::time::SystemTime; use dashmap::DashMap; +use databend_common_base::base::short_sql; use databend_common_base::base::Progress; use databend_common_base::runtime::drop_guard; use databend_common_base::runtime::Runtime; @@ -579,28 +580,3 @@ impl Drop for QueryContextShared { }) } } - -pub fn short_sql(sql: String) -> String { - use unicode_segmentation::UnicodeSegmentation; - const MAX_LENGTH: usize = 128; - - let query = sql.trim_start(); - if query.as_bytes().len() > MAX_LENGTH && query.as_bytes()[..6].eq_ignore_ascii_case(b"INSERT") - { - let mut result = Vec::new(); - let mut bytes_taken = 0; - for grapheme in query.graphemes(true) { - let grapheme_bytes = grapheme.as_bytes(); - if bytes_taken + grapheme_bytes.len() <= MAX_LENGTH { - result.extend_from_slice(grapheme_bytes); - bytes_taken += grapheme_bytes.len(); - } else { - break; - } - } - result.extend_from_slice(b"..."); - String::from_utf8(result).unwrap() // by construction, this cannot panic as we extracted unicode graphemes - } else { - sql - } -} diff --git a/src/query/service/tests/it/sessions/query_ctx.rs b/src/query/service/tests/it/sessions/query_ctx.rs index 438a1424c5fc..0175eb5d39f8 100644 --- a/src/query/service/tests/it/sessions/query_ctx.rs +++ b/src/query/service/tests/it/sessions/query_ctx.rs @@ -17,7 +17,6 @@ use databend_common_exception::Result; use databend_common_meta_app::storage::StorageFsConfig; use databend_common_meta_app::storage::StorageParams; use databend_common_meta_app::storage::StorageS3Config; -use databend_query::sessions::short_sql; use databend_query::sessions::TableContext; use databend_query::test_kits::ConfigBuilder; use databend_query::test_kits::TestFixture; @@ -66,29 +65,3 @@ async fn test_get_storage_accessor_fs() -> Result<()> { Ok(()) } - -#[test] -fn test_short_sql() { - // Test case 1: SQL query shorter than 128 bytes - let sql1 = "SELECT * FROM users WHERE id = 1;".to_string(); - assert_eq!(short_sql(sql1.clone()), sql1); - - // Test case 2: SQL query longer than 128 bytes and starts with "INSERT" - let long_sql = "INSERT INTO users (id, name, email) VALUES ".to_string() - + &"(1, 'John Doe', 'john@example.com'), ".repeat(5); // Adjusted for 128 bytes - let expected_result = long_sql.as_bytes()[..128].to_vec(); - let expected_result = String::from_utf8(expected_result).unwrap() + "..."; - assert_eq!(short_sql(long_sql), expected_result); - - // Test case 3: SQL query longer than 128 bytes but does not start with "INSERT" - let long_sql = "SELECT * FROM users WHERE ".to_string() + &"id = 1 OR ".repeat(20); // Adjusted for 128 bytes - assert_eq!(short_sql(long_sql.clone()), long_sql); - - // Test case 4: Empty SQL query - let empty_sql = "".to_string(); - assert_eq!(short_sql(empty_sql.clone()), empty_sql); - - // Test case 5: SQL query with leading whitespace - let sql_with_whitespace = " SELECT * FROM users;".to_string(); - assert_eq!(short_sql(sql_with_whitespace.clone()), sql_with_whitespace); -}