lib/util/Strings.v3

// Copyright 2010 Google Inc. All rights reserved.
// Copyright 2020 Ben L. Titzer. All rights reserved.
// See LICENSE for details of Apache 2.0 license.

// A component that contains a number of utility methods for manipulating
// strings, including hashing, comparing, etc.
component Strings {
	// Returns {true} if {c} is one of the four whitespace characters recognized by
	// this component: space, tab, carriage return, or newline.
	def isWhiteSpace(c: byte) -> bool {
		if (c == ' ') return true;
		if (c == '\t') return true;
		if (c == '\r') return true;
		return c == '\n';
	}
	// Returns a newly allocated copy of {str} with every leading and trailing
	// whitespace character (per {isWhiteSpace}) removed.
	def strip(str: string) -> string {
		// Trim the front first, then the back, and only then copy the remaining range.
		var trimmed = Ranges.skipTrailing(Ranges.skipLeading(str, isWhiteSpace), isWhiteSpace);
		return Ranges.dup(trimmed);
	}
	// Check if {str} starts with {start}, i.e. the latter is a prefix.
	// This is the generic {Ranges.startsWith} instantiated for {byte} elements.
	def startsWith = Ranges.startsWith<byte>;
	// Check if {str} ends with {end}, i.e. the latter is a suffix.
	// This is the generic {Ranges.endsWith} instantiated for {byte} elements.
	def endsWith = Ranges.endsWith<byte>;
	// Check if {str} ends with {end} and has {start} many preceding characters.
	// This is the generic {Ranges.endsWithFrom} instantiated for {byte} elements.
	def endsWithFrom = Ranges.endsWithFrom<byte>;
	// Splits {str} on runs of whitespace (as defined by {isWhiteSpace}) into an array of
	// strings. Empty tokens are never produced between or after whitespace runs.
	// maxsplit: Specifies how many splits to do. Pass -1 (or any negative value) to split on
	// "all occurrences". Passing 0 returns an array containing {str} itself, unsplit.
	// Notes:
	//  - An empty {str} yields [""].
	//  - A leading run of whitespace counts as one split towards {maxsplit}, even though
	//    no token precedes it.
	//  - Once {maxsplit} is reached, the rest of the string (after the whitespace run that
	//    ended the last token) is returned as the final element, trailing whitespace included.
	// The string is scanned in place by index, so the unsplit remainder is not re-copied
	// for every token.
	def splitOnWhiteSpace(str: string, maxsplit: int) -> Array<string> {
		if (str.length == 0) return [""];
		if (maxsplit == 0) return [str];

		var limit = if(maxsplit < 0, int.max, maxsplit);
		var result = Vector<string>.new();
		var end = str.length;
		var matched = 0;
		var start = 0, i = 0; // {start} = first index of the current token, {i} = scan cursor
		while (i < end && matched < limit) {
			if (isWhiteSpace(str[i])) {
				matched++;
				// save the token that just ended, unless it is empty (leading whitespace)
				if (start != i) result.put(Arrays.range(str, start, i));
				// consume the entire whitespace run so that it counts as a single split
				while (i < end && isWhiteSpace(str[i])) i++;
				start = i;
			} else {
				// still inside a token, keep scanning
				i++;
			}
		}
		// add the rest of {str}, but never an empty string
		if (start < end) result.put(Arrays.range(str, start, end));

		return result.extract();
	}
	// Splits {str} into an array of strings at every single character accepted by {charPred}.
	// charPred: the character predicate that determines which characters act as separators.
	// maxsplit: Specifies how many splits to do. Pass -1 to split on "all occurrences."
	def splitOnChar(str: string, charPred: (byte) -> bool, maxsplit: int) -> Array<string> {
		// Bind the string and predicate, leaving only the index open for the shared split loop.
		def isSeparatorAt = split_char_cond(str, charPred, _);
		return split_internal(str, maxsplit, isSeparatorAt, 1);
	}
	// Returns {true} if the character of {str} at index {idx} satisfies {charPred}.
	private def split_char_cond(str: string, charPred: (byte) -> bool, idx: int) -> bool {
		var ch = str[idx];
		return charPred(ch);
	}
	// Splits {str} into an array of strings around occurrences of {separator}.
	// separator: Specifies the separator to use when splitting the string. Pass empty string to split on whitespace.
	// maxsplit: Specifies how many splits to do. Pass -1 to split on "all occurrences."
	def split(str: string, separator: string, maxsplit: int) -> Array<string> {
		var sepLen = separator.length;
		// empty separator: split on whitespace
		if (sepLen == 0) return splitOnWhiteSpace(str, maxsplit);
		// one-character separator: split on that single character
		if (sepLen == 1) return splitOnChar(str, byte.==(separator[0], _), maxsplit);
		// general case: match the whole separator at each candidate index
		return split_internal(str, maxsplit, split_cond(str, separator, _), sepLen);
	}
	// Returns {true} if the part of {str} beginning at index {idx} starts with {separator}.
	private def split_cond(str: string, separator: string, idx: int) -> bool {
		var rest = str[idx ...];
		return Ranges.startsWith(rest, separator);
	}
	// Shared splitting loop used by {splitOnChar} and {split}.
	// maxsplit: maximum number of splits to perform; a negative value means unlimited and
	//           0 returns an array containing {str} itself, unsplit.
	// cond: returns {true} if a separator begins at the given index of {str}.
	// sep_len: number of characters to skip after a match, i.e. the separator's length.
	// An empty {str} yields [""]. Unlike whitespace splitting, empty substrings are kept:
	// adjacent separators, or a separator at either end of {str}, produce "" entries.
	private def split_internal(str: string, maxsplit: int, cond: (int) -> bool, sep_len: int) -> Array<string> {
		if (str.length == 0) {
			return [""];
		}
		if (maxsplit == 0) {
			return [str];
		}

		maxsplit = if(maxsplit < 0, int.max, maxsplit);

		var result = Vector<string>.new();
		var timesMatched = 0;
		var strStart = 0, strIdx = strStart;
		while (strIdx < str.length) {
			if (timesMatched >= maxsplit) {
				// Finished matching based on max number of splits
				break;
			}
			// ask the caller-supplied condition whether a separator begins at {strIdx}
			if (cond(strIdx)) {
				// matched the separator! save the (possibly empty) substring before it
				timesMatched++;
				var substr = Arrays.range(str, strStart, strIdx);
				result.put(substr);

				// skip over the separator
				strIdx = strIdx + sep_len;
				strStart = strIdx;
			} else {
				// no separator here, advance one character and try again
				strIdx++;
			}
		}
		// always add the rest of {str}, even when it is empty (e.g. trailing separator)
		var substr = Arrays.range(str, strStart, str.length);
		result.put(substr);

		return result.extract();
	}
	// Computes a hash code for {str}: the accumulator starts at the string's length and
	// is multiplied by 31 and increased by each character in order.
	def hash(str: string) -> int {
		var len = str.length;
		var h = len;
		for (i < len) h = h * 31 + str[i];
		return h;
	}
	// Compare two strings for equality, character by character.
	// Two {null} strings are considered equal; {null} is never equal to a non-null
	// string (including the empty string).
	def equal(arr1: string, arr2: string) -> bool {
		if (arr1 == arr2) return true; // same array, or both null
		// exactly one side is null here; answer false instead of reading a null array's length
		if (arr1 == null || arr2 == null) return false;
		if (arr1.length != arr2.length) return false;
		for (i < arr1.length) {
			if (arr1[i] != arr2[i]) return false;
		}
		return true;
	}
	// Creates a new, empty {HashMap} keyed by {string}, using this component's {hash}
	// and {equal} functions for the keys.
	def newMap<V>() -> HashMap<string, V> {
		return HashMap<string, V>.new(hash, equal);
	}
	// Writes the decimal representation of {val} into {buf} starting at index {pos} and
	// returns the index just past the last character written. A negative value is
	// preceded by '-'. The caller must ensure {buf} has enough room after {pos}.
	def renderDecimal(buf: Array<byte>, pos: int, val: int) -> int {
		def digits = Chars.hexMap_u;
		if (val < 0) {
			if (val == -2147483648) {
				// this value is rendered from a constant rather than being negated
				Arrays.copyInto("-2147483648", buf, pos);
				return pos + 11;
			}
			// emit the sign, then continue with the magnitude
			buf[pos] = '-';
			pos++;
			val = 0 - val;
		}
		// fast path: a single digit
		if (val < 10) {
			buf[pos] = digits[val];
			return pos + 1;
		}
		// fast path: exactly two digits
		if (val < 100) {
			buf[pos + 1] = digits[val % 10];
			buf[pos] = digits[val / 10];
			return pos + 2;
		}
		// general case: walk the powers of ten downwards, suppressing leading zeros
		var radix = 1000000000, started = false;
		while (radix > 0) {
			var d = val / radix;
			val = val % radix;
			if (d != 0) started = true;
			if (started) {
				buf[pos] = digits[d];
				pos++;
			}
			radix = radix / 10;
		}
		return pos;
	}
	// Writes {val} as exactly eight hexadecimal digits into {buf} starting at index {pos},
	// using the digit table {Chars.hexMap_u}, and returns the index just past the last digit.
	def renderHex8(buf: Array<byte>, pos: int, val: int) -> int {
		// Fill from the least significant digit (rightmost slot) towards {pos}.
		var slot = pos + 7;
		for (n = 0; n < 8; n++) {
			buf[slot] = Chars.hexMap_u[val & 0xf];
			val = val >>> 4;
			slot--;
		}
		return pos + 8;
	}
	// Runs the {render} function on a fresh {StringBuilder} and returns the string
	// produced by the builder that {render} returns.
	def render(render: StringBuilder -> StringBuilder) -> string {
		var fresh = StringBuilder.new();
		var rendered = render(fresh);
		return rendered.toString();
	}
	// Creates a new {StringBuilder} and appends {str} to it, returning the result
	// of that append.
	def builderOf(str: string) -> StringBuilder {
		var sb = StringBuilder.new();
		return sb.puts(str);
	}
	// Formats {fmt} with the single parameter {a} using a fresh {StringBuilder}
	// and returns the resulting string.
	def format1<A>(fmt: string, a: A) -> string {
		var sb = StringBuilder.new();
		return sb.put1(fmt, a).toString();
	}
	// Formats {fmt} with the two parameters {a} and {b} using a fresh {StringBuilder}
	// and returns the resulting string.
	def format2<A, B>(fmt: string, a: A, b: B) -> string {
		var sb = StringBuilder.new();
		return sb.put2(fmt, a, b).toString();
	}
	// Formats {fmt} with the three parameters {a}, {b}, and {c} using a fresh
	// {StringBuilder} and returns the resulting string.
	def format3<A, B, C>(fmt: string, a: A, b: B, c: C) -> string {
		var sb = StringBuilder.new();
		return sb.put3(fmt, a, b, c).toString();
	}
	// Parse a double-quoted string constant in {a} at {pos},
	// returning a pair of status (# success characters if > 0) and the value.
	// On success the status counts both quotes, i.e. it is the number of characters
	// consumed starting at {pos}. On failure the value is {null} and the status is
	// 0 if there is no opening quote at {pos}, otherwise a negative number derived from
	// the distance between {pos} and the index where parsing stopped.
	def parseLiteral(a: Array<byte>, pos: int) -> (int, string) {
		var max = a.length;
		// no input left, or no opening quote: not a string literal at all
		if (pos >= max) return (0, null);
		if (a[pos] != '\"') return (0, null);
		// {i} scans the characters after the opening quote; {buf} accumulates the value
		var i = pos + 1, buf = StringBuilder.new();
		while (i < max) {
			var ch = a[i];
			// characters below space (control characters) are rejected inside a literal
			if (ch < ' ') return (pos - i, null);
			// closing quote: success
			if (ch == '\"') return (1 + i - pos, buf.toString());
			if (ch == '\\') {
				// escape sequence: delegate to {Chars.parseEscape}, starting after the backslash
				// NOTE(review): {p.0} is presumably the number of characters consumed after the
				// backslash and {p.1} the decoded byte -- confirm against {Chars.parseEscape}.
				var p = Chars.parseEscape(a, i + 1);
				if (p.0 <= 0) return (pos - i + p.0, null);
				else buf.putc(p.1);
				i = i + p.0;
			} else if (ch > 127) {
				// non-ASCII byte: must begin a sequence accepted by {Utf8.decode1}
				// NOTE(review): {p.0} is presumably the byte length of the decoded sequence --
				// confirm against {Utf8.decode1}. The raw bytes are copied through unchanged.
				var p = Utf8.decode1(a[i ... max]);
				if (p.0 <= 0) return (pos - i, null);
				else buf.putr(a[i ..+ p.0]);
				// advance past all but one byte of the sequence; the {i++} below covers the last
				i += p.0 - 1;
			} else {
				// ordinary ASCII character: copy as-is
				buf.putc(ch);
			}
			i++;
		}
		// ran off the end of the input without finding a closing quote
		return (pos - i, null);
	}
	// Returns a function that, given a {StringBuilder}, calls {StringBuilder.puts} on it
	// with the string {s}.
	def puts(s: string) -> StringBuilder -> StringBuilder {
		// Partially apply {StringBuilder.puts}, leaving the receiver open.
		def appender = StringBuilder.puts(_, s);
		return appender;
	}
	// Returns {true} if {a} sorts strictly before {b} in bytewise lexicographical order.
	// A {null} string sorts before every non-null string, and a proper prefix sorts
	// before any longer string that starts with it.
	def asciiLt(a: string, b: string) -> bool {
		if (a == null) return b != null; // null < non-null
		if (b == null) return false;     // non-null > null
		// Only the common prefix needs comparing; the first differing byte decides.
		var common = if(a.length < b.length, a.length, b.length);
		for (i < common) {
			if (a[i] != b[i]) return a[i] < b[i];
		}
		// Identical up to the shorter length: the shorter string is the smaller one.
		return a.length < b.length;
	}
	// Returns {s} unchanged when it is not {null}; otherwise returns the placeholder
	// string "<null>".
	def nonnull(s: string) -> string {
		if (s == null) return "<null>";
		return s;
	}
}