diff options
| -rw-r--r-- | cudl.c | 346 | ||||
| -rw-r--r-- | cudl.h | 10 | ||||
| -rw-r--r-- | spec.txt | 6 | ||||
| -rw-r--r-- | test.cudl | 7 | 
4 files changed, 363 insertions, 6 deletions
| @@ -4,7 +4,13 @@  #include <string.h>  #include "cudl.h" -#define STRIP_WHITESPACE(text) while (isspace(*text)) text++ +#define STRIP_WHITESPACE(text) while (isspace(*(text))) (text)++ +#define IS_KEY_CHAR(c) (\ +	'a' <= (c) && (c) <= 'z' ||\ +	'A' <= (c) && (c) <= 'Z' ||\ +	'0' <= (c) && (c) <= '9' ||\ +	(c) == '_' || (c) == '-'\ +)  int cudl_err = CUDL_OK; @@ -37,12 +43,23 @@ void cudl_debug(struct cudl_value value) {  			else  				printf("%%false");  			break; +		case CUDL_TAG_STRING: +			printf("\"%s\"", value.data.string); +			break;;  		case CUDL_TAG_ARRAY:  			printf("[");  			for (i = 0; i < value.data.array.length; i++)  				cudl_debug(value.data.array.values[i]);  			printf("]");  			break; +		case CUDL_TAG_MAP: +			printf("{"); +			for (i = 0; i < value.data.map.length; i++) { +				printf("\"%s\": ", value.data.map.fields[i].key); +				cudl_debug(value.data.map.fields[i].value); +			} +			printf("}"); +			break;  		default:  			printf("UNKNOWN");  			break; @@ -89,6 +106,210 @@ static size_t parse_bool_or_null(char *input, struct cudl_value *value) {  	return 0;  } +/* Convert UCS character to utf-8 bytes. + * Return number of bytes generated. + * Sets cudl_error on error. + * Shamelessly lifted from https://github.com/cktan/tomc99 */ +size_t cudl_ucs_to_utf8(int64_t ucs, char utf8[6]) { +	if ( +		0xd800 <= ucs && ucs <= 0xdfff || +		0xfffe <= ucs && ucs <= 0xffff || +		ucs < 0 +	) { +		cudl_err = CUDL_ERR_UNRECOGNISED_UNICODE; +		return 0; +	} + +	/* 0x00000000 - 0x0000007F: +	   0xxxxxxx +	*/ +	if (ucs <= 0x7F) { +		utf8[0] = (unsigned char) ucs; +		return 1; +	} + +	/* 0x00000080 - 0x000007FF: +	   110xxxxx 10xxxxxx +	*/ +	if (ucs <= 0x000007FF) { +		utf8[0] = 0xc0 | (ucs >> 6); +		utf8[1] = 0x80 | (ucs & 0x3f); +		return 2; +	} + +	/* 0x00000800 - 0x0000FFFF: +	   1110xxxx 10xxxxxx 10xxxxxx +	*/ +	if (ucs <= 0x0000FFFF) { +		utf8[0] = 0xe0 | (ucs >> 12); +		utf8[1] = 0x80 | ((ucs >> 6) & 0x3f); +		utf8[2] = 0x80 | (ucs & 0x3f); +		return 3; +	} + +	/* 0x00010000 - 0x001FFFFF: +	   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx +	*/ +	if (ucs <= 0x001FFFFF) { +		utf8[0] = 0xf0 | (ucs >> 18); +		utf8[1] = 0x80 | ((ucs >> 12) & 0x3f); +		utf8[2] = 0x80 | ((ucs >> 6) & 0x3f); +		utf8[3] = 0x80 | (ucs & 0x3f); +		return 4; +	} + +	/* 0x00200000 - 0x03FFFFFF: +	   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +	*/ +	if (ucs <= 0x03FFFFFF) { +		utf8[0] = 0xf8 | (ucs >> 24); +		utf8[1] = 0x80 | ((ucs >> 18) & 0x3f); +		utf8[2] = 0x80 | ((ucs >> 12) & 0x3f); +		utf8[3] = 0x80 | ((ucs >> 6) & 0x3f); +		utf8[4] = 0x80 | (ucs & 0x3f); +		return 5; +	} + +	/* 0x04000000 - 0x7FFFFFFF: +	   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +	*/ +	if (ucs <= 0x7FFFFFFF) { +		utf8[0] = 0xfc | (ucs >> 30); +		utf8[1] = 0x80 | ((ucs >> 24) & 0x3f); +		utf8[2] = 0x80 | ((ucs >> 18) & 0x3f); +		utf8[3] = 0x80 | ((ucs >> 12) & 0x3f); +		utf8[4] = 0x80 | ((ucs >> 6) & 0x3f); +		utf8[5] = 0x80 | (ucs & 0x3f); +		return 6; +	} + +	cudl_err = CUDL_ERR_UNRECOGNISED_UNICODE; +	return 0; +} + +/* Parse a string starting after the opening quote. + * Set string to be the contents of the string. + * No memory is allocated if an error occurs. */ +static size_t parse_quoted_string(char *input, char **string) { +	size_t length, capacity; +	char *original_input, *newstring; +	int64_t ucs; +	int ucs_length, i; + +	length = 0; +	capacity = 32; +	original_input = input; +	if ((*string = malloc(capacity)) == NULL) { +		cudl_err = CUDL_ERR_OUT_OF_MEMORY; +		return 0; +	} +	for (;;) { +		if (*input == '\0') { +			cudl_err = CUDL_ERR_UNMATCHED_QUOTE; +			free(*string); +			return 0; +		} +		if (*input == '"') { +			if ((newstring = realloc(*string, length + 1)) == NULL) { +				cudl_err = CUDL_ERR_OUT_OF_MEMORY; +				free(*string); +				return 0; +			} +			*string = newstring; +			(*string)[length] = '\0'; +			input++; +			return input - original_input; +		} +		if (length >= capacity) { +			if ((newstring = realloc(*string, capacity * 2)) == NULL) { +				cudl_err = CUDL_ERR_OUT_OF_MEMORY; +				free(*string); +				return 0; +			} +			*string = newstring; +			capacity *= 2; +		} +		if (*input == '\\') { +			input++; +			switch (*input) { +				case '\0': +					cudl_err = CUDL_ERR_EXPECTED_ESCAPE_SEQUENCE; +					free(*string); +					return 0; +				case 'b': +					(*string)[length++] = '\b'; +					input++; +					break; +				case 't': +					(*string)[length++] = '\t'; +					input++; +					break; +				case 'n': +					(*string)[length++] = '\n'; +					input++; +					break; +				case 'r': +					(*string)[length++] = '\r'; +					input++; +					break; +				case '"': +					(*string)[length++] = '"'; +					input++; +					break; +				case '\\': +					(*string)[length++] = '\\'; +					input++; +					break; +				case 'u': +				case 'U': +					ucs = 0; +					ucs_length = (*input == 'u') ? 4 : 8; +					input++; +					for (i = 0; i < ucs_length; i++) { +						if (input[i] == '\0') { +							cudl_err = CUDL_ERR_EXPECTED_ESCAPE_SEQUENCE; +							free(*string); +							return 0; +						} +						if ('0' <= input[i] && input[i] <= '9') { +							ucs = (ucs << 4) + (input[i] - '0'); +						} else if ('a' <= input[i] && input[i] <= 'z') { +							ucs = (ucs << 4) + (input[i] - 'a' + 10); +						} else if ('A' <= input[i] && input[i] <= 'Z') { +							ucs = (ucs << 4) + (input[i] - 'A' + 10); +						} else { +							cudl_err = CUDL_ERR_EXPECTED_ESCAPE_SEQUENCE; +							free(*string); +							return 0; +						} +					} +					if (length + 6 > capacity) { +						if ((newstring = realloc(*string, capacity * 2)) == NULL) { +							cudl_err = CUDL_ERR_OUT_OF_MEMORY; +							free(*string); +							return 0; +						} +						*string = newstring; +						capacity *= 2; +					} +					length += cudl_ucs_to_utf8(ucs, (*string) + length); +					if (cudl_err) { +						free(*string); +						return 0; +					} +					input += ucs_length; +					break; +				default: +					(*string)[length++] = *input; +					input++; +					break; +			} +		} else { +			(*string)[length++] = *(input++); +		} +	} +} +  static size_t parse_array(char *input, struct cudl_value *value) {  	size_t length, capacity;  	struct cudl_value *values, *newvalues; @@ -127,13 +348,13 @@ static size_t parse_array(char *input, struct cudl_value *value) {  				free(values);  				return 0;  			} +			values = newvalues;  			capacity *= 2;  		}  		input += parse_value(input, values + length);  		if (cudl_err) { -			for (i = 0; i < length; i++) { +			for (i = 0; i < length; i++)  				cudl_deinit_value(values[i]); -			}  			free(values);  			return 0;  		} @@ -141,11 +362,130 @@ static size_t parse_array(char *input, struct cudl_value *value) {  	}  } +static size_t parse_map_key(char *input, char **key) { +	char *original_input; +	switch (*input) { +		case '\0': +			cudl_err = CUDL_ERR_EXPECTED_MAP_KEY; +			return 0; +		case '"': +			input++; +			return parse_quoted_string(input, key) + 1; +		default: +			original_input = input; +			while (IS_KEY_CHAR(*input)) +				input++; +			if (input == original_input) { +				cudl_err = CUDL_ERR_EXPECTED_MAP_KEY; +				return 0; +			} +			if ((*key = malloc(input - original_input + 1)) == NULL) { +				cudl_err = CUDL_ERR_OUT_OF_MEMORY; +				return 0; +			} +			memcpy(*key, original_input, input - original_input); +			(*key)[input - original_input] = '\0'; +			return input - original_input; +	} +} + +static size_t parse_map(char *input, struct cudl_value *value, char end_char) { +	printf("Parsing a map from: %s\n", input); +	char *original_input; +	int i; +	struct cudl_map_field *fields, *newfields; +	size_t length, capacity; + +	original_input = input; +	value->tag = CUDL_TAG_MAP; +	length = 0; +	capacity = 8; +	if ((fields = malloc(capacity * sizeof(struct cudl_map_field))) == NULL) { +		cudl_err = CUDL_ERR_OUT_OF_MEMORY; +		return 0; +	} + +	STRIP_WHITESPACE(input); +	for (;;) { +		printf("Parsing a field from: %s\n", input); +		if (*input == end_char) { +			input++; +			fields = realloc(fields, length * sizeof(struct cudl_map_field)); +			value->data.map.length = length; +			value->data.map.fields = fields; +			return input - original_input; +		} +		if (*input == '\0') { +			cudl_err = CUDL_ERR_UNMATCHED_BRACE; +			for (i = 0; i < length; i++) { +				cudl_deinit_value(fields[i].value); +				free(fields[i].key); +			} +			free(fields); +			return 0; +		} +		if (length >= capacity) { +			if ((newfields = realloc(fields,  2 * capacity * sizeof(struct cudl_map_field))) == NULL) { +				cudl_err = CUDL_ERR_OUT_OF_MEMORY; +				for (i = 0; i < length; i++) { +					cudl_deinit_value(fields[i].value); +					free(fields[i].key); +				} +				free(fields); +				return 0; +			} +			fields = newfields; +			capacity *= 2; +		} +		printf("Parsing a key from: %s\n", input); +		input += parse_map_key(input, &fields[length].key); +		if (cudl_err) { +			for (i = 0; i < length; i++) { +				cudl_deinit_value(fields[i].value); +				free(fields[i].key); +			} +			free(fields); +			return 0; +		} +		STRIP_WHITESPACE(input); +		if (*input != ':') { +			cudl_err = CUDL_ERR_EXPECTED_COLON; +			for (i = 0; i < length; i++) { +				cudl_deinit_value(fields[i].value); +				free(fields[i].key); +			} +			free(fields[length].key); +			free(fields); +			return 0; +		} +		input++; +		STRIP_WHITESPACE(input); +		printf("Parsing a field value from: %s\n", input); +		input += parse_value(input, &fields[length].value); +		if (cudl_err) { +			for (i = 0; i < length; i++) { +				cudl_deinit_value(fields[i].value); +				free(fields[i].key); +			} +			free(fields[length].key); +			free(fields); +			return 0; +		} +		length++; +	} +} +  static size_t _parse_value(char *input, struct cudl_value *value) {  	if (*input == '%')  		return parse_bool_or_null(++input, value) + 1;  	if (*input == '[')  		return parse_array(++input, value) + 1; +	if (*input == '{') +		return parse_map(++input, value, '}') + 1; +	if (*input == '"') { +		value->tag = CUDL_TAG_STRING; +		return parse_quoted_string(++input, &value->data.string) + 1; +	}  	cudl_err = CUDL_ERR_UNRECOGNISED_VALUE;  	return 0;  } @@ -31,16 +31,24 @@ enum {  	CUDL_TAG_NULL,  	CUDL_TAG_BOOL,  	CUDL_TAG_ARRAY, +	CUDL_TAG_MAP, +	CUDL_TAG_STRING,  };  enum {  	CUDL_OK = 0,  	CUDL_ERR_OUT_OF_MEMORY, -	CUDL_ERR_EXPECTED_VALUE,  	CUDL_ERR_READING, +	CUDL_ERR_EXPECTED_VALUE,  	CUDL_ERR_EXPECTED_BOOL_OR_NULL, +	CUDL_ERR_EXPECTED_ESCAPE_SEQUENCE, +	CUDL_ERR_EXPECTED_MAP_KEY, +	CUDL_ERR_EXPECTED_COLON,  	CUDL_ERR_UNMATCHED_BRACK, +	CUDL_ERR_UNMATCHED_BRACE, +	CUDL_ERR_UNMATCHED_QUOTE,  	CUDL_ERR_UNRECOGNISED_VALUE, +	CUDL_ERR_UNRECOGNISED_UNICODE,  };  extern int cudl_err; @@ -1,4 +1,4 @@ -# Spec attempt 5 +# CUDL - Clear and Unmistakable Data Language  * Every file contains 1 value, which may have other values nested inside it.  * A schema can be provided when a file is parsed which gives it's value a type. @@ -17,7 +17,8 @@ An inline-end character is one of the following:  ## Map  A sequence of key:value pairs. No delimeter is needed as every value will have a ending marker. -If a key starts with a quote then it continues until another quote ends it. Quotes can be escaped by using 2 of them. +The first key may be preceeded by whitespace and whitespace can occur before or after the : between the key and value. +If a key starts with a quote then it obeys the same rules as a quoted string.  Otherwise a key must match [A-Za-z0-9_-]+  A map can be preceeded by a { and succeeded by a } @@ -45,6 +46,7 @@ The following escape sequences are available for quoted strings and keys:  \" - quote  \\ - backslash  \uXXXX - unicode XXXX +\UXXXXXXXX - unicode XXXXXXXX  ```  ### Multiline string @@ -2,4 +2,11 @@  	%true  	%false  	[%null %null %false] +	["hello\nfriend\t\tstuff\"" "world"] +	"\U0001f600" +	{ +		testing: "this is a test map" +		i_hope_it_works: {nesting: "nested maps!!!"} +		"here's an interesting key :D": %null +	}  ] | 
