From e59f5f474f8ca9f61d5f96a0edfbc9e4a8248341 Mon Sep 17 00:00:00 2001
From: importer system account
Date: Wed, 2 Oct 2024 09:56:52 -0400
Subject: [PATCH] add sling copy scripts, properties adjustments

---
Review notes (this section is ignored when the patch is applied):
 * Line structure and indentation were reconstructed (4 space indent assumed,
   2 / 4 space yaml indent assumed inside the env file echo strings). If the
   context lines fail to match, apply with `git apply --ignore-whitespace`.
 * The '+' content differs from commit e59f5f47 (so the index blob ids of the
   new / changed files are stale) because these defects were corrected:
   - copy_all_database_tables_with_sling looped forever on a retry pass
     ('continue' was reached without advancing pos)
   - copy_mysql_database_tables_to_clickhouse.sh always exited with status 0
   - execute_sql_statement_via_sling carried on after an invalid db_server
     argument or an unremovable output file
   - clickhouse_database_exists / clickhouse_database_is_empty ignored a
     failure to parse the query output (an empty result read as "0 tables")
   - set_sql_data_array_from_file ignored record parse failures and dropped a
     final record which lacked a trailing newline
   - remove_credentials_from_properties leaked globals, used eval, and named
     the wrong function in its error message
   - "readlink -f $0" was unquoted; some error messages went to stdout

 .../clone_mysql_database.sh                   |  30 ++-
 ...opy_mysql_database_tables_to_clickhouse.sh | 182 +++++++++++
 ...anage_cbioportal_databases_tool.properties |  20 +-
 .../mysql_command_line_functions.sh           |   2 +-
 .../parse_property_file_functions.sh          |  15 +
 .../sling_command_line_functions.sh           | 279 ++++++++++++++++++
 6 files changed, 511 insertions(+), 17 deletions(-)
 create mode 100755 scripts/clickhouse_import_support/copy_mysql_database_tables_to_clickhouse.sh
 create mode 100644 scripts/clickhouse_import_support/sling_command_line_functions.sh

diff --git a/scripts/clickhouse_import_support/clone_mysql_database.sh b/scripts/clickhouse_import_support/clone_mysql_database.sh
index 03c17e8d..8feac02c 100755
--- a/scripts/clickhouse_import_support/clone_mysql_database.sh
+++ b/scripts/clickhouse_import_support/clone_mysql_database.sh
@@ -1,10 +1,19 @@
 #!/usr/bin/env bash
 
-# bash declaration dependencies
-source parse_property_file_functions.sh
-source mysql_command_line_functions.sh
-
-# non-local environment variables in use
+# load dependencies
+unset this_script_dir
+this_script_dir="$(dirname "$(readlink -f "$0")")"
+if ! source "$this_script_dir/parse_property_file_functions.sh" ; then
+    echo "Error : unable to load dependency : $this_script_dir/parse_property_file_functions.sh" >&2
+    exit 1
+fi
+if ! source "$this_script_dir/mysql_command_line_functions.sh" ; then
+    echo "Error : unable to load dependency : $this_script_dir/mysql_command_line_functions.sh" >&2
+    exit 1
+fi
+unset this_script_dir
+
+# other non-local environment variables in use
 unset my_properties
 unset database_table_list
 unset source_database_name
@@ -32,15 +41,16 @@ function initialize_main() {
         usage
         return 1
     fi
-    if ! initialize_mysql_command_line_functions ; then # this also purges the mysql credentials from the environment for security
+    if ! initialize_mysql_command_line_functions ; then
         usage
         return 1
     fi
+    remove_credentials_from_properties my_properties # no longer needed - remove for security
     if [ "$database_to_clone_tables_from" == "blue" ] ; then
-        source_database_name="${my_properties['blue_database_name']}"
+        source_database_name="${my_properties['mysql_blue_database_name']}"
     else
         if [ "$database_to_clone_tables_from" == "green" ] ; then
-            source_database_name="${my_properties['green_database_name']}"
+            source_database_name="${my_properties['mysql_green_database_name']}"
         else
             echo "Error : database_to_clone_tables_from must be one of {blue, green}" >&2
             usage
@@ -48,10 +58,10 @@ function initialize_main() {
         fi
     fi
     if [ "$database_to_clone_tables_to" == "blue" ] ; then
-        destination_database_name="${my_properties['blue_database_name']}"
+        destination_database_name="${my_properties['mysql_blue_database_name']}"
     else
         if [ "$database_to_clone_tables_to" == "green" ] ; then
-            destination_database_name="${my_properties['green_database_name']}"
+            destination_database_name="${my_properties['mysql_green_database_name']}"
         else
             echo "Error : database_to_clone_tables_to must be one of {blue, green}" >&2
             usage
diff --git a/scripts/clickhouse_import_support/copy_mysql_database_tables_to_clickhouse.sh b/scripts/clickhouse_import_support/copy_mysql_database_tables_to_clickhouse.sh
new file mode 100755
index 00000000..770cdba8
--- /dev/null
+++ b/scripts/clickhouse_import_support/copy_mysql_database_tables_to_clickhouse.sh
@@ -0,0 +1,182 @@
+#!/usr/bin/env bash
+
+# load dependencies
+unset this_script_dir
+this_script_dir="$(dirname "$(readlink -f "$0")")"
+if ! source "$this_script_dir/parse_property_file_functions.sh" ; then
+    echo "Error : unable to load dependency : $this_script_dir/parse_property_file_functions.sh" >&2
+    exit 1
+fi
+if ! source "$this_script_dir/sling_command_line_functions.sh" ; then
+    echo "Error : unable to load dependency : $this_script_dir/sling_command_line_functions.sh" >&2
+    exit 1
+fi
+unset this_script_dir
+
+# print the expected command line to stderr
+function usage() {
+    echo "usage: copy_mysql_database_tables_to_clickhouse.sh <properties_filepath> <blue|green>" >&2
+}
+
+# other non-local environment variables in use
+unset my_properties
+unset database_table_list
+unset clickhouse_destination_database_name
+unset mysql_source_database_name
+declare -A my_properties
+declare -a database_table_list
+declare -A table_has_been_copied_and_verified
+clickhouse_destination_database_name=""
+mysql_source_database_name=""
+database_table_list_filepath="$(pwd)/cmd_database_table_list.txt"
+
+# validate database_to_transfer, load the property file, select the database names, and initialize sling.
+# properties_filepath and database_to_transfer are read from the caller's (main's) local variables.
+function initialize_main() {
+    if ! [ "$database_to_transfer" == "blue" ] && ! [ "$database_to_transfer" == "green" ] ; then
+        echo "Error : argument for database_to_transfer must be either 'blue' or 'green'" >&2
+        return 1
+    fi
+    if ! parse_property_file "$properties_filepath" my_properties ; then
+        usage
+        return 1
+    fi
+    if [ "$database_to_transfer" == "blue" ] ; then
+        clickhouse_destination_database_name="${my_properties['clickhouse_blue_database_name']}"
+        mysql_source_database_name="${my_properties['mysql_blue_database_name']}"
+    else
+        clickhouse_destination_database_name="${my_properties['clickhouse_green_database_name']}"
+        mysql_source_database_name="${my_properties['mysql_green_database_name']}"
+    fi
+    if ! initialize_sling_command_line_functions "$database_to_transfer" ; then
+        usage
+        return 1
+    fi
+    remove_credentials_from_properties my_properties
+}
+
+# returns 0 when the destination clickhouse database exists and is empty, 1 when it does not exist, 2 when it is not empty
+function destination_database_exists_and_is_empty() {
+    if ! clickhouse_database_exists "$clickhouse_destination_database_name" ; then
+        echo "Error : could not proceed with database copying because destination database does not exist: $clickhouse_destination_database_name" >&2
+        return 1
+    fi
+    if ! clickhouse_database_is_empty "$clickhouse_destination_database_name" ; then
+        echo "Error : could not proceed with database copying because destination database is not empty: $clickhouse_destination_database_name" >&2
+        return 2
+    fi
+    return 0
+}
+
+# query mysql (via sling) for the base tables of the source database and store the names in database_table_list
+function set_database_table_list() {
+    local statement="SELECT table_name FROM INFORMATION_SCHEMA.tables WHERE table_type='BASE TABLE' AND table_schema='$mysql_source_database_name'"
+    rm -f "$database_table_list_filepath"
+    if ! execute_sql_statement_via_sling "$statement" "mysql" "$database_table_list_filepath" ; then
+        echo "Warning : failed to execute mysql statement : $statement" >&2
+        return 1
+    fi
+    unset sql_data_array
+    if ! set_sql_data_array_from_file "$database_table_list_filepath" 0 ; then
+        return 1
+    fi
+    database_table_list=()
+    local table_name=""
+    for table_name in "${sql_data_array[@]}" ; do
+        # copy element by element (no word splitting / globbing); blank entries are not table names
+        if [ -n "$table_name" ] ; then
+            database_table_list+=("$table_name")
+        fi
+    done
+    return 0
+}
+
+# clean up after a run : delete output stream files and unset the global variables of this script.
+# NOTE(review) : delete_output_stream_files is not defined in this file - confirm that a sourced file provides it.
+function shutdown_main_and_clean_up() {
+    #TODO restore
+    #shutdown_sling_command_line_functions
+    delete_output_stream_files
+    unset my_properties
+    unset database_table_list
+    unset table_has_been_copied_and_verified
+    unset database_table_list_filepath
+    unset record_count_comparison_filepath
+}
+
+# returns 0 if the named table has been flagged as copied and verified, otherwise 1
+function successful_copy_verified_flag_has_been_set() {
+    local table_name=$1
+    if [ "${table_has_been_copied_and_verified[$table_name]}" == "true" ] ; then
+        return 0
+    fi
+    return 1
+}
+
+# flag the named table as copied and verified
+function set_successful_copy_verified_flag() {
+    local table_name=$1
+    table_has_been_copied_and_verified[$table_name]="true"
+}
+
+# attempt to copy (and verify) every table in database_table_list which has not yet been flagged as verified.
+# returns 1 if any table failed to copy or failed verification during this pass, otherwise 0.
+function copy_all_database_tables_with_sling() {
+    local pos=0
+    local exit_status=0
+    local table_name=""
+    while [ $pos -lt ${#database_table_list[@]} ] ; do
+        table_name="${database_table_list[$pos]}"
+        pos=$(($pos+1)) # advance before any 'continue' so that an already copied table cannot stall the loop
+        if successful_copy_verified_flag_has_been_set "$table_name" ; then
+            # table successfully copied on a previous pass
+            continue
+        fi
+        echo "attempting to copy data in table $table_name using sling"
+        if ! transfer_table_data_via_sling "$mysql_source_database_name" "$clickhouse_destination_database_name" "$table_name" "TODOdeletefile" ; then
+            echo "Warning : failure to copy table $table_name" >&2
+            exit_status=1 # any failed table copies cause an eventual failure status to be returned
+        else
+            if ! destination_table_matches_source_table "$table_name" ; then
+                echo "Warning : failure to verify copy of table $table_name" >&2
+                exit_status=1 # any failed table copies cause an eventual failure status to be returned
+            else
+                set_successful_copy_verified_flag "$table_name"
+            fi
+        fi
+    done
+    return $exit_status
+}
+
+# make up to 3 passes over the table list, returning 0 as soon as one pass leaves no table uncopied or unverified
+function copy_all_database_tables_with_sling_allow_retry() {
+    local remaining_try_count=3
+    while [ $remaining_try_count -ne 0 ] ; do
+        #TODO record iteration start timestamp
+        if copy_all_database_tables_with_sling ; then
+            return 0
+        fi
+        #TODO pause for the minimum try duration (5 minutes?)
+        remaining_try_count=$((remaining_try_count-1))
+    done
+    return 1
+}
+
+# entry point : $1 is the properties file path, $2 is the database to transfer ('blue' or 'green')
+function main() {
+    local properties_filepath=$1
+    local database_to_transfer=$2
+    local exit_status=0
+    if ! initialize_main "$properties_filepath" "$database_to_transfer" ||
+            ! destination_database_exists_and_is_empty ||
+            ! set_database_table_list ||
+            ! copy_all_database_tables_with_sling_allow_retry ; then
+        exit_status=1
+    fi
+    shutdown_main_and_clean_up
+    return $exit_status
+}
+
+main "$1" "$2"
+
+exit $? # propagate the status returned by main
diff --git a/scripts/clickhouse_import_support/manage_cbioportal_databases_tool.properties b/scripts/clickhouse_import_support/manage_cbioportal_databases_tool.properties
index c05e2a44..43a15449 100644
--- a/scripts/clickhouse_import_support/manage_cbioportal_databases_tool.properties
+++ b/scripts/clickhouse_import_support/manage_cbioportal_databases_tool.properties
@@ -18,10 +18,18 @@
 mysql_server_username=
 mysql_server_password=
 mysql_server_host_name=
+mysql_server_port=
 mysql_server_additional_args=
-disk_capacity_mysql_data_filesystem_megabytes=
-disk_usage_invisible_to_mysql_user_megabytes=
-disk_consumption_anticipated_during_import_megabytes=
-blue_database_name=
-green_database_name=
-shelved_database_name=
+mysql_server_disk_capacity_megabytes=
+mysql_server_disk_usage_invisible_to_user_megabytes=
+mysql_server_disk_consumption_anticipated_during_import_megabytes=
+mysql_blue_database_name=
+mysql_green_database_name=
+mysql_shelved_database_name=
+clickhouse_server_username=
+clickhouse_server_password=
+clickhouse_server_host_name=
+clickhouse_server_port=
+clickhouse_server_additional_args=
+clickhouse_blue_database_name=
+clickhouse_green_database_name=
diff --git a/scripts/clickhouse_import_support/mysql_command_line_functions.sh b/scripts/clickhouse_import_support/mysql_command_line_functions.sh
index 40e395fb..99a6bbc1 100644
--- a/scripts/clickhouse_import_support/mysql_command_line_functions.sh
+++ b/scripts/clickhouse_import_support/mysql_command_line_functions.sh
@@ -74,7 +74,7 @@ function set_sql_data_field_value_from_record() {
     while [ $pos -lt $record_string_length ] ; do
         local character_at_position="${record_string:$pos:1}"
         # a newline should occur at the end of the read line, and only there. Embedded newlines are encoded with '\n'
-        if [ "$character_at_position" == "$NL" ] ; then
+        if [ "$character_at_position" == "$LF" ] ; then
             field_index=$((field_index+1))
             if [ "$field_index" -gt "$column_number" ] ; then
                 # field has been completely parsed
diff --git a/scripts/clickhouse_import_support/parse_property_file_functions.sh b/scripts/clickhouse_import_support/parse_property_file_functions.sh
index 37f5a738..cee767db 100644
--- a/scripts/clickhouse_import_support/parse_property_file_functions.sh
+++ b/scripts/clickhouse_import_support/parse_property_file_functions.sh
@@ -177,3 +177,18 @@ function parse_property_file() {
     done < $property_file_path
     return 0
 }
+
+# remove_credentials_from_properties
+# Unsets the mysql and clickhouse username / password entries in the associative array whose name is passed as $1.
+# Returns 1 (after reporting to stderr) if $1 does not name an associative array.
+function remove_credentials_from_properties() {
+    local associative_array_name=$1 # array names must be proper identifiers (no spaces)
+    local key_name=""
+    if ! variable_name_refers_to_an_associative_array "$associative_array_name" ; then
+        echo "error: variable name '$associative_array_name' was passed to function remove_credentials_from_properties() but was not available in the environment, or did not refer to a created associative array." >&2
+        return 1
+    fi
+    for key_name in "mysql_server_username" "mysql_server_password" "clickhouse_server_username" "clickhouse_server_password" ; do
+        unset "${associative_array_name}[${key_name}]" # the quoted name[key] form removes one element without needing eval
+    done
+}
diff --git a/scripts/clickhouse_import_support/sling_command_line_functions.sh b/scripts/clickhouse_import_support/sling_command_line_functions.sh
new file mode 100644
index 00000000..e60cefc1
--- /dev/null
+++ b/scripts/clickhouse_import_support/sling_command_line_functions.sh
@@ -0,0 +1,279 @@
+#!/usr/bin/env bash
+
+unset configured_sling_env_dir_path
+unset configured_sling_env_file_path
+unset sql_data_field_value
+unset sql_data_array
+configured_sling_env_dir_path=""
+configured_sling_env_file_path=""
+sql_data_field_value=""
+declare -a sql_data_array
+sling_database_exists_filepath="$(pwd)/sclf_database_exists.txt"
+sling_database_table_list_filepath="$(pwd)/sclf_database_table_list.txt"
+
+# append the MYSQL_DATABASE_CONNECTION entry (for the selected blue / green database) to the sling env file
+function write_selected_mysql_connection_to_env_file() {
+    local env_file=$1
+    local database_to_transfer=$2
+    local db_name=""
+    if [ "$database_to_transfer" == "blue" ] ; then
+        db_name="${my_properties['mysql_blue_database_name']}"
+    else
+        db_name="${my_properties['mysql_green_database_name']}"
+    fi
+    echo "  MYSQL_DATABASE_CONNECTION:" >> "$env_file"
+    echo "    type: mysql" >> "$env_file"
+    echo "    database: $db_name" >> "$env_file"
+    echo "    host: ${my_properties['mysql_server_host_name']}" >> "$env_file"
+    echo "    password: ${my_properties['mysql_server_password']}" >> "$env_file"
+    echo "    port: \"${my_properties['mysql_server_port']}\"" >> "$env_file"
+    echo "    user: ${my_properties['mysql_server_username']}" >> "$env_file"
+    echo >> "$env_file"
+}
+
+# append the CLICKHOUSE_DATABASE_CONNECTION entry (for the selected blue / green database) to the sling env file
+function write_selected_clickhouse_connection_to_env_file() {
+    local env_file=$1
+    local database_to_transfer=$2
+    local db_name=""
+    if [ "$database_to_transfer" == "blue" ] ; then
+        db_name="${my_properties['clickhouse_blue_database_name']}"
+    else
+        db_name="${my_properties['clickhouse_green_database_name']}"
+    fi
+    local uname="${my_properties['clickhouse_server_username']}"
+    local pw="${my_properties['clickhouse_server_password']}"
+    local clickhost="${my_properties['clickhouse_server_host_name']}"
+    local clickport="${my_properties['clickhouse_server_port']}"
+    local additional_args="${my_properties['clickhouse_server_additional_args']}"
+    echo "  CLICKHOUSE_DATABASE_CONNECTION:" >> "$env_file"
+    echo "    type: clickhouse" >> "$env_file"
+    echo "    database: $db_name" >> "$env_file"
+    echo "    host: ${my_properties['clickhouse_server_host_name']}" >> "$env_file"
+    echo "    password: ${my_properties['clickhouse_server_password']}" >> "$env_file"
+    echo "    port: \"${my_properties['clickhouse_server_port']}\"" >> "$env_file"
+    echo "    user: ${my_properties['clickhouse_server_username']}" >> "$env_file"
+#    echo "    http_url: https://$uname:$pw@$clickhost:${clickport}/${db_name}$additional_args" >> "$env_file"
+    echo >> "$env_file"
+}
+
+# create a private, timestamped directory holding a sling env.yaml which defines both database connections.
+# sets configured_sling_env_dir_path and configured_sling_env_file_path. returns 1 on failure.
+function write_sling_env_file() {
+    local database_to_transfer=$1
+    configured_sling_env_dir_path="$(pwd)/sling_env_$(date "+%Y-%m-%d-%H-%M-%S")"
+    configured_sling_env_file_path="$configured_sling_env_dir_path/env.yaml"
+    local env_dir="$configured_sling_env_dir_path"
+    local env_file="$configured_sling_env_file_path"
+    if ! rm -rf "$env_dir" || ! mkdir "$env_dir" ; then
+        echo "Error : unable to create sling env.yaml subdirectory $env_dir" >&2
+        return 1
+    fi
+    chmod 700 "$env_dir"
+    echo "# Environment Credentials for Sling CLI" > "$env_file"
+    chmod 600 "$env_file"
+    echo >> "$env_file"
+    echo "# See https://docs.slingdata.io/sling-cli/environment" >> "$env_file"
+    echo >> "$env_file"
+    echo "connections:" >> "$env_file"
+    write_selected_mysql_connection_to_env_file "$env_file" "$database_to_transfer"
+    write_selected_clickhouse_connection_to_env_file "$env_file" "$database_to_transfer"
+    echo "variables: {}" >> "$env_file"
+    if ! [ "$(wc -l < "$env_file")" -eq 22 ] ; then # 5 header lines + 8 per connection + 1 for variables
+        echo "Error : could not successfully write default mysql properties to file $env_file" >&2
+        return 1
+    fi
+    return 0
+}
+
+# prepare sling for use by writing the env file for the selected (blue / green) database
+function initialize_sling_command_line_functions() {
+    local database_to_transfer=$1
+    write_sling_env_file "$database_to_transfer"
+}
+
+# delete the sling env file / directory and the query output files, and unset the variables declared by this file
+function shutdown_sling_command_line_functions() {
+    rm -f "$configured_sling_env_file_path"
+    rmdir "$configured_sling_env_dir_path"
+    rm -f "$sling_database_exists_filepath"
+    rm -f "$sling_database_table_list_filepath"
+    unset configured_sling_env_dir_path
+    unset configured_sling_env_file_path
+    unset sql_data_field_value
+    unset sql_data_array
+    unset sling_database_exists_filepath
+    unset sling_database_table_list_filepath
+}
+
+# run a sql statement through sling and write the (comma separated value) result to output_filepath.
+# returns 1 for an unrecognized db_server or an unremovable output file, otherwise the exit status of sling.
+function execute_sql_statement_via_sling() {
+    local statement=$1
+    local db_server=$2 # must be 'mysql' or 'clickhouse'
+    local output_filepath=$3
+    local sling_connection=""
+    if [ "$db_server" == "mysql" ] ; then
+        sling_connection="MYSQL_DATABASE_CONNECTION"
+    else
+        if [ "$db_server" == "clickhouse" ] ; then
+            sling_connection="CLICKHOUSE_DATABASE_CONNECTION"
+        else
+            echo "Error : db_server argument to execute_sql_statement_via_sling must be 'mysql' or 'clickhouse'. Received : $db_server" >&2
+            return 1
+        fi
+    fi
+    if [ -e "$output_filepath" ] && ! rm -f "$output_filepath" ; then
+        echo "Error : could not overwrite existing output file $output_filepath when executing sql statement $statement" >&2
+        return 1
+    fi
+    (
+        export DBUS_SESSION_BUS_ADDRESS=/dev/null ;
+        export SLING_HOME_DIR="$configured_sling_env_dir_path" ;
+        sling run --src-conn "$sling_connection" --src-stream "$statement" --stdout > "$output_filepath"
+    )
+}
+
+# copy one table from the mysql source database into the clickhouse destination database using sling.
+# the stdout of sling is written to output_filepath; the return status is the exit status of sling.
+function transfer_table_data_via_sling() {
+    local mysql_source_database_name=$1
+    local clickhouse_destination_database_name=$2
+    local table_name=$3
+    local output_filepath=$4
+    (
+        export DBUS_SESSION_BUS_ADDRESS=/dev/null ;
+        export SLING_HOME_DIR="$configured_sling_env_dir_path" ;
+        export SLING_ALLOW_EMPTY="TRUE" ;
+        sling run \
+            --src-conn MYSQL_DATABASE_CONNECTION \
+            --src-stream "$mysql_source_database_name.$table_name" \
+            --tgt-conn CLICKHOUSE_DATABASE_CONNECTION \
+            --tgt-object "$clickhouse_destination_database_name.$table_name" \
+            --stdout \
+            > "$output_filepath"
+    )
+}
+
+# set_sql_data_field_value_from_record
+# Sets sql_data_field_value to the value in (zero-based) column column_number of record_string.
+# This function currently assumes that all queries are going to produce simple results
+# which do not cause the quotation of value fields. Output from sling is in comma separated
+# value format, so any value which contains a comma, or which contains a quotation mark or
+# any of the other reserved characters (such as embedded newlines, embedded quotation marks, etc)
+# would be quoted in the output.
+# Currently the only anticipated use of this function is for querying cBioPortal table names
+# or record counts from tables. If use of this function expands to more general value retrieval
+# it may be necessary to add proper parsing of quoted values. The presence of quotation marks
+# on the line currently causes a failure to parse.
+function set_sql_data_field_value_from_record() {
+    local record_string=$1
+    local column_number=$2
+    unset sql_data_field_value
+    local record_string_length=${#record_string}
+    local LF=$'\n'
+    local pos=0
+    local field_index=0
+    local parsed_value=""
+    while [ $pos -lt $record_string_length ] ; do
+        local character_at_position="${record_string:$pos:1}"
+        # no quoted values allowed
+        if [ "$character_at_position" == "\"" ] ; then
+            echo "Error : encountered quotation mark (not yet handled) while looking for column $column_number during parsing returned database record : $record_string" >&2
+            return 1
+        fi
+        # a newline should occur at the end of the read line, and only there.
+        if [ "$character_at_position" == "$LF" ] ; then
+            field_index=$((field_index+1))
+            if [ "$field_index" -gt "$column_number" ] ; then
+                # field has been completely parsed
+                sql_data_field_value="$parsed_value"
+                return 0
+            fi
+            echo "Error : unable to locate column $column_number while parsing returned database record : $record_string" >&2
+            return 1
+        fi
+        # a comma character delimits the beginning of a new field, and is not part of the field.
+        if [ "$character_at_position" == "," ] ; then
+            field_index=$((field_index+1))
+            if [ "$field_index" -gt "$column_number" ] ; then
+                # field has been completely parsed
+                sql_data_field_value="$parsed_value"
+                return 0
+            fi
+            pos=$(($pos+1))
+            continue
+        fi
+        # pass over the current (plain) character
+        pos=$(($pos+1))
+        if [ "$field_index" -eq "$column_number" ] ; then
+            parsed_value+="$character_at_position"
+        fi
+    done
+    #TODO : add removing of flanking whitespace
+    sql_data_field_value="$parsed_value"
+}
+
+# read a (comma separated value) sling output file, skip the header line, and store column column_number of each
+# remaining record in sql_data_array. returns 1 if the file is unreadable or a record cannot be parsed.
+function set_sql_data_array_from_file() {
+    local filepath=$1
+    local column_number=$2
+    unset sql_data_array
+    if ! [ -r "$filepath" ] ; then
+        echo "Error : could not read output mysql query results from file : $filepath" >&2
+        return 1
+    fi
+    local headers_have_been_parsed=0
+    local line=""
+    sql_data_array=()
+    while IFS='' read -r line || [ -n "$line" ] ; do # also accept a final record which lacks a trailing newline
+        if [ "$headers_have_been_parsed" -eq 0 ] ; then
+            headers_have_been_parsed=1
+        else
+            if ! set_sql_data_field_value_from_record "$line" "$column_number" ; then
+                return 1
+            fi
+            sql_data_array+=("$sql_data_field_value")
+        fi
+    done < "$filepath"
+    return 0
+}
+
+# returns 0 if exactly one clickhouse database is named database_name, 1 if the query or its parsing fails, otherwise 2
+function clickhouse_database_exists() {
+    local database_name=$1
+    local statement="SELECT COUNT(*) FROM system.databases WHERE name = '$database_name'"
+    if ! execute_sql_statement_via_sling "$statement" "clickhouse" "$sling_database_exists_filepath" ; then
+        echo "Warning : unable to determine if database $database_name exists using : $statement" >&2
+        return 1
+    fi
+    if ! set_sql_data_array_from_file "$sling_database_exists_filepath" 0 || [ -z "${sql_data_array[0]}" ] ; then
+        echo "Warning : unable to parse the record count returned by : $statement" >&2
+        return 1
+    fi
+    if [[ "${sql_data_array[0]}" -ne 1 ]] ; then
+        echo "Warning : database $database_name not present on database server, or there are multiple listings for that name" >&2
+        return 2
+    fi
+    return 0
+}
+
+# returns 0 if database_name contains no tables or views, 1 if the query or its parsing fails, otherwise 2
+function clickhouse_database_is_empty() {
+    local database_name=$1
+    local statement="SELECT COUNT(*) FROM INFORMATION_SCHEMA.tables WHERE table_schema='$database_name'"
+    if ! execute_sql_statement_via_sling "$statement" "clickhouse" "$sling_database_table_list_filepath" ; then
+        echo "Warning : unable to retrieve table/view list from database $database_name using : $statement" >&2
+        return 1
+    fi
+    if ! set_sql_data_array_from_file "$sling_database_table_list_filepath" 0 || [ -z "${sql_data_array[0]}" ] ; then
+        echo "Warning : unable to parse the record count returned by : $statement" >&2
+        return 1
+    fi
+    if [[ "${sql_data_array[0]}" -ne 0 ]] ; then
+        echo "Warning : database $database_name has tables or views (is not empty as required)" >&2
+        return 2
+    fi
+    return 0
+}