diff --git a/src/readstat.h b/src/readstat.h index faccdbe..bf2c375 100644 --- a/src/readstat.h +++ b/src/readstat.h @@ -104,22 +104,11 @@ typedef enum readstat_error_e { READSTAT_ERROR_TOO_FEW_COLUMNS, READSTAT_ERROR_TOO_MANY_COLUMNS, READSTAT_ERROR_NAME_IS_ZERO_LENGTH, - READSTAT_ERROR_BAD_TIMESTAMP_VALUE, - READSTAT_ERROR_BAD_MR_STRING + READSTAT_ERROR_BAD_TIMESTAMP_VALUE } readstat_error_t; const char *readstat_error_message(readstat_error_t error_code); -typedef struct mr_set_s { - char type; - char *name; - char *label; - int is_dichotomy; - int counted_value; - char **subvariables; - int num_subvars; -} mr_set_t; - typedef struct readstat_metadata_s { int64_t row_count; int64_t var_count; @@ -132,8 +121,6 @@ typedef struct readstat_metadata_s { const char *file_label; const char *file_encoding; unsigned int is64bit:1; - size_t multiple_response_sets_length; - mr_set_t *mr_sets; } readstat_metadata_t; /* If the row count is unknown (e.g. it's an XPORT or POR file, or an SAV @@ -151,8 +138,6 @@ readstat_endian_t readstat_get_endianness(readstat_metadata_t *metadata); const char *readstat_get_table_name(readstat_metadata_t *metadata); const char *readstat_get_file_label(readstat_metadata_t *metadata); const char *readstat_get_file_encoding(readstat_metadata_t *metadata); -const mr_set_t *readstat_get_mr_sets(readstat_metadata_t *metadata); -size_t readstat_get_multiple_response_sets_length(readstat_metadata_t *metadata); typedef struct readstat_value_s { union { diff --git a/src/readstat_metadata.c b/src/readstat_metadata.c index e098b26..2534539 100644 --- a/src/readstat_metadata.c +++ b/src/readstat_metadata.c @@ -43,11 +43,3 @@ const char *readstat_get_file_encoding(readstat_metadata_t *metadata) { const char *readstat_get_table_name(readstat_metadata_t *metadata) { return metadata->table_name; } - -size_t readstat_get_multiple_response_sets_length(readstat_metadata_t *metadata) { - return metadata->multiple_response_sets_length; -} - -const mr_set_t *readstat_get_mr_sets(readstat_metadata_t *metadata) { - return metadata->mr_sets; -} diff --git a/src/spss/readstat_sav.h b/src/spss/readstat_sav.h index e417ac4..c4b68de 100644 --- a/src/spss/readstat_sav.h +++ b/src/spss/readstat_sav.h @@ -3,7 +3,6 @@ // #include "readstat_spss.h" -#include "../readstat.h" #pragma pack(push, 1) @@ -101,9 +100,6 @@ typedef struct sav_ctx_s { uint64_t lowest_double; uint64_t highest_double; - size_t multiple_response_sets_length; - mr_set_t *mr_sets; - double bias; int format_version; @@ -121,7 +117,6 @@ typedef struct sav_ctx_s { #define SAV_RECORD_SUBTYPE_INTEGER_INFO 3 #define SAV_RECORD_SUBTYPE_FP_INFO 4 -#define SAV_RECORD_SUBTYPE_MULTIPLE_RESPONSE_SETS 7 #define SAV_RECORD_SUBTYPE_PRODUCT_INFO 10 #define SAV_RECORD_SUBTYPE_VAR_DISPLAY 11 #define SAV_RECORD_SUBTYPE_LONG_VAR_NAME 13 diff --git a/src/spss/readstat_sav_read.c b/src/spss/readstat_sav_read.c index 2f274ae..d36bab9 100644 --- a/src/spss/readstat_sav_read.c +++ b/src/spss/readstat_sav_read.c @@ -8,14 +8,12 @@ #include #include #include -#include #include "../readstat.h" #include "../readstat_bits.h" #include "../readstat_iconv.h" #include "../readstat_convert.h" #include "../readstat_malloc.h" -#include "../CKHashTable.h" #include "readstat_sav.h" #include "readstat_sav_compress.h" @@ -147,180 +145,6 @@ static readstat_error_t sav_parse_variable_display_parameter_record(sav_ctx_t *c static readstat_error_t sav_parse_machine_integer_info_record(const void *data, size_t data_len, sav_ctx_t *ctx); static readstat_error_t sav_parse_long_string_value_labels_record(const void *data, size_t size, size_t count, sav_ctx_t *ctx); static readstat_error_t sav_parse_long_string_missing_values_record(const void *data, size_t size, size_t count, sav_ctx_t *ctx); -static readstat_error_t sav_read_multiple_response_sets(size_t data_len, sav_ctx_t *ctx); - -static readstat_error_t parse_mr_counted_value(const char **next_part, mr_set_t *result) { - readstat_error_t retval = READSTAT_OK; - if (result->type == 'D') { - result->is_dichotomy = 1; - const char *digit_start = (*next_part); - while (*(*next_part) != ' ' && *(*next_part) != '\0') { - (*next_part)++; - } - int internal_count = (int)strtol(digit_start, NULL, 10); - if (*(*next_part) != ' ') { - retval = READSTAT_ERROR_BAD_MR_STRING; - goto cleanup; - } - (*next_part)++; - digit_start = (*next_part); - for (int i = 0; i < internal_count && isdigit(*(*next_part)); i++) { - (*next_part)++; - } - result->counted_value = (int)strtol(digit_start, NULL, 10); - if (*(*next_part) != ' ' && *(*next_part) != '\0') { - retval = READSTAT_ERROR_BAD_MR_STRING; - goto cleanup; - } - } - else if (result->type == 'C') { - result->is_dichotomy = 0; - result->counted_value = -1; - } -cleanup: - return retval; -} - -static readstat_error_t parse_mr_line(const char *line, mr_set_t *result) { - readstat_error_t retval = READSTAT_OK; - *result = (mr_set_t){0}; - - const char *equals_pos = strchr(line, '='); - if (equals_pos == NULL || equals_pos[1] == '\0') { - retval = READSTAT_ERROR_BAD_MR_STRING; - goto cleanup; - } - - result->type = equals_pos[1]; - int name_length = equals_pos - line; - if ((result->name = malloc(name_length + 1)) == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - strncpy(result->name, line, name_length); - result->name[name_length] = '\0'; - const char *next_part = equals_pos + 2; // Start after the '=' and type character - if ((retval = parse_mr_counted_value(&next_part, result)) != READSTAT_OK) goto cleanup; - if (*next_part != ' ') { - retval = READSTAT_ERROR_BAD_MR_STRING; - goto cleanup; - } - next_part++; - const char *digit_start = next_part; - while (isdigit(*next_part)) { - next_part++; - } - if (*next_part != ' ') { - retval = READSTAT_ERROR_BAD_MR_STRING; - goto cleanup; - } - size_t count = strtoul(digit_start, NULL, 10); - next_part++; // Move past the space after the digits - if (strlen(next_part) < count) { - retval = READSTAT_ERROR_BAD_MR_STRING; - goto cleanup; - } - - result->label = malloc(count + 1); // +1 for the null-terminator - if (result->label == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - strncpy(result->label, next_part, count); - result->label[count] = '\0'; - - next_part += count; - if (*next_part != ' ') { - retval = READSTAT_ERROR_BAD_MR_STRING; - goto cleanup; - } - next_part++; - - char **subvariables = NULL; - int subvar_count = 0; - while (*next_part) { - if (*next_part == ' ') { // Skip any extra spaces - next_part++; - continue; - } - - const char *start = next_part; - while (*next_part && *next_part != ' ') { - next_part++; // Move to the end of the current subvariable - } - - size_t length = next_part - start; - char *subvariable = malloc(length + 1); // Allocate memory for the subvariable - if (subvariable == NULL) { - retval = READSTAT_ERROR_MALLOC; - for (int i = 0; i < subvar_count; i++) { - free(subvariables[i]); - } - free(subvariables); - free(result->label); - result->label = NULL; - goto cleanup; - } - strncpy(subvariable, start, length); - subvariable[length] = '\0'; // Null-terminate the string - - char **temp = realloc(subvariables, (subvar_count + 1) * sizeof(char *)); - if (temp == NULL) { - retval = READSTAT_ERROR_MALLOC; - free(subvariable); - for (int i = 0; i < subvar_count; i++) { - free(subvariables[i]); - } - free(subvariables); - free(result->label); - result->label = NULL; - goto cleanup; - } - subvariables = temp; - subvariables[subvar_count++] = subvariable; // Add the new subvariable to the array - - if (*next_part == ' ') next_part++; // Move past the space - } - - result->subvariables = subvariables; - result->num_subvars = subvar_count; - -cleanup: - return retval; -} - -static readstat_error_t sav_read_multiple_response_sets(size_t data_len, sav_ctx_t *ctx) { - readstat_error_t retval = READSTAT_OK; - - char *mr_string = readstat_malloc(data_len + 1); - if (mr_string == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - mr_string[data_len] = '\0'; - - if (ctx->io->read(mr_string, data_len, ctx->io->io_ctx) < data_len) { - retval = READSTAT_ERROR_PARSE; - goto cleanup; - } - - char *token = strtok(mr_string, "$\n"); - int num_lines = 0; - while (token != NULL) { - if ((ctx->mr_sets = realloc(ctx->mr_sets, (num_lines + 1) * sizeof(mr_set_t))) == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - retval = parse_mr_line(token, &ctx->mr_sets[num_lines]); - if (retval != READSTAT_OK) goto cleanup; - num_lines++; - token = strtok(NULL, "$\n"); - } - ctx->multiple_response_sets_length = num_lines; - -cleanup: - return retval; -} static void sav_tag_missing_double(readstat_value_t *value, sav_ctx_t *ctx) { double fp_value = value->v.double_value; @@ -893,6 +717,10 @@ static readstat_error_t sav_process_row(unsigned char *buffer, size_t buffer_len } if (++offset == col_info->width) { if (++segment_offset < var_info->n_segments) { + if (raw_str_used == 0) { + retval = READSTAT_ERROR_PARSE; + goto done; + } raw_str_used--; } offset = 0; @@ -1511,10 +1339,6 @@ static readstat_error_t sav_parse_records_pass1(sav_ctx_t *ctx) { retval = sav_parse_machine_integer_info_record(data_buf, data_len, ctx); if (retval != READSTAT_OK) goto cleanup; - } else if (subtype == SAV_RECORD_SUBTYPE_MULTIPLE_RESPONSE_SETS) { - retval = sav_read_multiple_response_sets(data_len, ctx); - if (retval != READSTAT_OK) - goto cleanup; } else { if (io->seek(data_len, READSTAT_SEEK_CUR, io->io_ctx) == -1) { retval = READSTAT_ERROR_SEEK; @@ -1842,39 +1666,6 @@ readstat_error_t readstat_parse_sav(readstat_parser_t *parser, const char *path, metadata.file_label = ctx->file_label; - // Replace short MR names with long names - ck_hash_table_t *var_dict = ck_hash_table_init(1024, 8); - for (size_t i = 0; i < ctx->var_count; i++) { - spss_varinfo_t *current_varinfo = ctx->varinfo[i]; - if (current_varinfo != NULL && current_varinfo->name[0] != '\0') { - ck_str_hash_insert(current_varinfo->name, current_varinfo, var_dict); - } - } - for (size_t i = 0; i < ctx->multiple_response_sets_length; i++) { - mr_set_t mr = ctx->mr_sets[i]; - for (size_t j = 0; j < mr.num_subvars; j++) { - char* sv_name_upper = malloc(strlen(mr.subvariables[j]) + 1); - if (sv_name_upper == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - sv_name_upper[strlen(mr.subvariables[j])] = '\0'; - for (int c = 0; mr.subvariables[j][c] != '\0'; c++) { - sv_name_upper[c] = toupper((unsigned char) mr.subvariables[j][c]); - } - spss_varinfo_t *info = (spss_varinfo_t *)ck_str_hash_lookup(sv_name_upper, var_dict); - if (info) { - free(mr.subvariables[j]); - mr.subvariables[j] = info->longname; - } - } - } - if (var_dict) - ck_hash_table_free(var_dict); - - metadata.multiple_response_sets_length = ctx->multiple_response_sets_length; - metadata.mr_sets = ctx->mr_sets; - if (ctx->handle.metadata(&metadata, ctx->user_ctx) != READSTAT_HANDLER_OK) { retval = READSTAT_ERROR_USER_ABORT; goto cleanup; @@ -1887,7 +1678,6 @@ readstat_error_t readstat_parse_sav(readstat_parser_t *parser, const char *path, if ((retval = sav_handle_variables(ctx)) != READSTAT_OK) goto cleanup; - if ((retval = sav_handle_fweight(ctx)) != READSTAT_OK) goto cleanup;