From 821af6b22a6db14daf5ef4b1c01d8b5aa8c9ad91 Mon Sep 17 00:00:00 2001 From: Sam Connelly Date: Fri, 7 Jun 2024 22:35:27 -0400 Subject: [PATCH 1/2] Allow an empty input file to be parsed correctly Yes, it uses a goto statement. Even though goto statements are considered harmful, I think it is okay in this situation. --- utils.cpp | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/utils.cpp b/utils.cpp index 60c5e22..955dd40 100644 --- a/utils.cpp +++ b/utils.cpp @@ -693,28 +693,42 @@ char *get_file_contents (const char *filename) { // find file size (Spencer says it's not a hack) fseek (file, 0, SEEK_END); size = ftell (file); + + // don't panic if the file is empty + if (size == 0) { + read_size = 1; + size = 1; + buffer = (char*) malloc (2); + buffer[0] = '\n'; + goto finished_reading; + } + rewind (file); // If there is a UTF-8 endian marker at the beginning of the file, skip it. - const unsigned char utf8_endian_mark[] = {0xEF, 0xBB, 0xBF}; - bool matched = true; - for (unsigned i = 0; matched && i < sizeof(utf8_endian_mark); i++) { - int c = fgetc(file); - if (c == EOF) { - return NULL; + // Enclosed in braces to allow the goto to happen + { + const unsigned char utf8_endian_mark[] = {0xEF, 0xBB, 0xBF}; + bool matched = true; + for (unsigned i = 0; matched && i < sizeof(utf8_endian_mark); i++) { + int c = fgetc(file); + if (c == EOF) { + return NULL; + } + matched &= (unsigned char)c == utf8_endian_mark[i]; + } + if (matched) { + size -= sizeof(utf8_endian_mark); + } else { + rewind(file); } - matched &= (unsigned char)c == utf8_endian_mark[i]; - } - if (matched) { - size -= sizeof(utf8_endian_mark); - } else { - rewind(file); } // now allocate the memory and read in the contents buffer = (char *) malloc (size + 1); read_size = fread (buffer, 1, size, file); +finished_reading: fclose (file); if (read_size != size) { From c2015dfd22b193008d723ba18e8cf8dd00e94a0e Mon Sep 17 00:00:00 2001 From: Sam Connelly Date: Sun, 9 Jun 2024 15:46:52 -0400 Subject: [PATCH 2/2] Refactor UTF-8 BOM check courtesy of Tari --- utils.cpp | 36 +++++++++--------------------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/utils.cpp b/utils.cpp index 955dd40..b82f7ed 100644 --- a/utils.cpp +++ b/utils.cpp @@ -693,42 +693,24 @@ char *get_file_contents (const char *filename) { // find file size (Spencer says it's not a hack) fseek (file, 0, SEEK_END); size = ftell (file); - - // don't panic if the file is empty - if (size == 0) { - read_size = 1; - size = 1; - buffer = (char*) malloc (2); - buffer[0] = '\n'; - goto finished_reading; - } - rewind (file); // If there is a UTF-8 endian marker at the beginning of the file, skip it. - // Enclosed in braces to allow the goto to happen - { - const unsigned char utf8_endian_mark[] = {0xEF, 0xBB, 0xBF}; - bool matched = true; - for (unsigned i = 0; matched && i < sizeof(utf8_endian_mark); i++) { - int c = fgetc(file); - if (c == EOF) { - return NULL; - } - matched &= (unsigned char)c == utf8_endian_mark[i]; - } - if (matched) { - size -= sizeof(utf8_endian_mark); - } else { - rewind(file); - } + const unsigned char utf8_endian_mark[] = {0xEF, 0xBB, 0xBF}; + unsigned char file_head[3]; + read_size = fread (file_head, 1, 3, file); + + if (read_size == 3 && memcmp(file_head, utf8_endian_mark, sizeof(utf8_endian_mark)) == 0) + // Skip UTF-8 BOM + size -= sizeof(utf8_endian_mark); + else { + rewind(file); } // now allocate the memory and read in the contents buffer = (char *) malloc (size + 1); read_size = fread (buffer, 1, size, file); -finished_reading: fclose (file); if (read_size != size) {