From 52acddf36c8cb3778ab2098a0d95cc2e375a4069 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 24 Apr 2023 18:20:10 -0400 Subject: string-list: multi-delimiter `string_list_split_in_place()` Enhance `string_list_split_in_place()` to accept multiple characters as delimiters instead of a single character. Instead of using `strchr(2)` to locate the first occurrence of the given delimiter character, `string_list_split_in_place_multi()` uses `strcspn(2)` to move past the initial segment of characters comprised of any characters in the delimiting set. When only a single delimiting character is provided, `strpbrk(2)` (which is implemented with `strcspn(2)`) has equivalent performance to `strchr(2)`. Modern `strcspn(2)` implementations treat an empty delimiter or the singleton delimiter as a special case and fall back to calling strchrnul(). Both glibc[1] and musl[2] implement `strcspn(2)` this way. This change is one step to removing `strtok(2)` from the tree. Note that `string_list_split_in_place()` is not a strict replacement for `strtok()`, since it will happily turn sequential delimiter characters into empty entries in the resulting string_list. For example: string_list_split_in_place(&xs, "foo:;:bar:;:baz", ":;", -1) would yield a string list of: ["foo", "", "", "bar", "", "", "baz"] Callers that wish to emulate the behavior of strtok(2) more directly should call `string_list_remove_empty_items()` after splitting. To avoid regressions for the new multi-character delimter cases, update t0063 in this patch as well. [1]: https://sourceware.org/git/?p=glibc.git;a=blob;f=string/strcspn.c;hb=glibc-2.37#l35 [2]: https://git.musl-libc.org/cgit/musl/tree/src/string/strcspn.c?h=v1.2.3#n11 Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/helper/test-string-list.c | 4 ++-- t/t0063-string-list.sh | 51 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) (limited to 't') diff --git a/t/helper/test-string-list.c b/t/helper/test-string-list.c index 2123dda85b..63df88575c 100644 --- a/t/helper/test-string-list.c +++ b/t/helper/test-string-list.c @@ -62,7 +62,7 @@ int cmd__string_list(int argc, const char **argv) struct string_list list = STRING_LIST_INIT_NODUP; int i; char *s = xstrdup(argv[2]); - int delim = *argv[3]; + const char *delim = argv[3]; int maxsplit = atoi(argv[4]); i = string_list_split_in_place(&list, s, delim, maxsplit); @@ -111,7 +111,7 @@ int cmd__string_list(int argc, const char **argv) */ if (sb.len && sb.buf[sb.len - 1] == '\n') strbuf_setlen(&sb, sb.len - 1); - string_list_split_in_place(&list, sb.buf, '\n', -1); + string_list_split_in_place(&list, sb.buf, "\n", -1); string_list_sort(&list); diff --git a/t/t0063-string-list.sh b/t/t0063-string-list.sh index 46d4839194..1fee6d9010 100755 --- a/t/t0063-string-list.sh +++ b/t/t0063-string-list.sh @@ -18,6 +18,14 @@ test_split () { " } +test_split_in_place() { + cat >expected && + test_expect_success "split (in place) $1 at $2, max $3" " + test-tool string-list split_in_place '$1' '$2' '$3' >actual && + test_cmp expected actual + " +} + test_split "foo:bar:baz" ":" "-1" < Date: Mon, 24 Apr 2023 18:20:17 -0400 Subject: t/helper/test-hashmap.c: avoid using `strtok()` Avoid using the non-reentrant `strtok()` to separate the parts of each incoming command. Instead of replacing it with `strtok_r()`, let's instead use the more friendly pair of `string_list_split_in_place()` and `string_list_remove_empty_items()`. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/helper/test-hashmap.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 't') diff --git a/t/helper/test-hashmap.c b/t/helper/test-hashmap.c index 36ff07bd4b..0eb0b3d49c 100644 --- a/t/helper/test-hashmap.c +++ b/t/helper/test-hashmap.c @@ -2,6 +2,7 @@ #include "git-compat-util.h" #include "hashmap.h" #include "strbuf.h" +#include "string-list.h" struct test_entry { @@ -150,6 +151,7 @@ static void perf_hashmap(unsigned int method, unsigned int rounds) */ int cmd__hashmap(int argc, const char **argv) { + struct string_list parts = STRING_LIST_INIT_NODUP; struct strbuf line = STRBUF_INIT; int icase; struct hashmap map = HASHMAP_INIT(test_entry_cmp, &icase); @@ -159,21 +161,26 @@ int cmd__hashmap(int argc, const char **argv) /* process commands from stdin */ while (strbuf_getline(&line, stdin) != EOF) { - char *cmd, *p1 = NULL, *p2 = NULL; + char *cmd, *p1, *p2; unsigned int hash = 0; struct test_entry *entry; /* break line into command and up to two parameters */ - cmd = strtok(line.buf, DELIM); + string_list_setlen(&parts, 0); + string_list_split_in_place(&parts, line.buf, DELIM, 2); + string_list_remove_empty_items(&parts, 0); + /* ignore empty lines */ - if (!cmd || *cmd == '#') + if (!parts.nr) + continue; + if (!*parts.items[0].string || *parts.items[0].string == '#') continue; - p1 = strtok(NULL, DELIM); - if (p1) { + cmd = parts.items[0].string; + p1 = parts.nr >= 1 ? parts.items[1].string : NULL; + p2 = parts.nr >= 2 ? parts.items[2].string : NULL; + if (p1) hash = icase ? strihash(p1) : strhash(p1); - p2 = strtok(NULL, DELIM); - } if (!strcmp("add", cmd) && p1 && p2) { @@ -260,6 +267,7 @@ int cmd__hashmap(int argc, const char **argv) } } + string_list_clear(&parts, 0); strbuf_release(&line); hashmap_clear_and_free(&map, struct test_entry, ent); return 0; -- cgit v1.3 From deeabc1ff07fed102e90f9037aed749b16abd9a6 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 24 Apr 2023 18:20:20 -0400 Subject: t/helper/test-oidmap.c: avoid using `strtok()` Apply similar treatment as in the previous commit to remove usage of `strtok()` from the "oidmap" test helper. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/helper/test-oidmap.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 't') diff --git a/t/helper/test-oidmap.c b/t/helper/test-oidmap.c index a7b7b38df1..5ce9eb3334 100644 --- a/t/helper/test-oidmap.c +++ b/t/helper/test-oidmap.c @@ -4,6 +4,7 @@ #include "oidmap.h" #include "setup.h" #include "strbuf.h" +#include "string-list.h" /* key is an oid and value is a name (could be a refname for example) */ struct test_entry { @@ -25,6 +26,7 @@ struct test_entry { */ int cmd__oidmap(int argc UNUSED, const char **argv UNUSED) { + struct string_list parts = STRING_LIST_INIT_NODUP; struct strbuf line = STRBUF_INIT; struct oidmap map = OIDMAP_INIT; @@ -35,19 +37,24 @@ int cmd__oidmap(int argc UNUSED, const char **argv UNUSED) /* process commands from stdin */ while (strbuf_getline(&line, stdin) != EOF) { - char *cmd, *p1 = NULL, *p2 = NULL; + char *cmd, *p1, *p2; struct test_entry *entry; struct object_id oid; /* break line into command and up to two parameters */ - cmd = strtok(line.buf, DELIM); + string_list_setlen(&parts, 0); + string_list_split_in_place(&parts, line.buf, DELIM, 2); + string_list_remove_empty_items(&parts, 0); + /* ignore empty lines */ - if (!cmd || *cmd == '#') + if (!parts.nr) + continue; + if (!*parts.items[0].string || *parts.items[0].string == '#') continue; - p1 = strtok(NULL, DELIM); - if (p1) - p2 = strtok(NULL, DELIM); + cmd = parts.items[0].string; + p1 = parts.nr >= 1 ? parts.items[1].string : NULL; + p2 = parts.nr >= 2 ? parts.items[2].string : NULL; if (!strcmp("put", cmd) && p1 && p2) { @@ -108,6 +115,7 @@ int cmd__oidmap(int argc UNUSED, const char **argv UNUSED) } } + string_list_clear(&parts, 0); strbuf_release(&line); oidmap_free(&map, 1); return 0; -- cgit v1.3 From a2742f8c59dae6ef55895933e0950d61b6d03720 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 24 Apr 2023 18:20:23 -0400 Subject: t/helper/test-json-writer.c: avoid using `strtok()` Apply similar treatment as in the previous commit to remove usage of `strtok()` from the "oidmap" test helper. Each of the different commands that the "json-writer" helper accepts pops the next space-delimited token from the current line and interprets it as a string, integer, or double (with the exception of the very first token, which is the command itself). To accommodate this, split the line in place by the space character, and pass the corresponding string_list to each of the specialized `get_s()`, `get_i()`, and `get_d()` functions. `get_i()` and `get_d()` are thin wrappers around `get_s()` that convert their result into the appropriate type by either calling `strtol()` or `strtod()`, respectively. In `get_s()`, we mark the token as "consumed" by incrementing the `consumed_nr` counter, indicating how many tokens we have read up to that point. Because each of these functions needs the string-list parts, the number of tokens consumed, and the line number, these three are wrapped up in to a struct representing the line state. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/helper/test-json-writer.c | 76 ++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 28 deletions(-) (limited to 't') diff --git a/t/helper/test-json-writer.c b/t/helper/test-json-writer.c index 86887f5320..afe393f597 100644 --- a/t/helper/test-json-writer.c +++ b/t/helper/test-json-writer.c @@ -1,5 +1,6 @@ #include "test-tool.h" #include "json-writer.h" +#include "string-list.h" static const char *expect_obj1 = "{\"a\":\"abc\",\"b\":42,\"c\":true}"; static const char *expect_obj2 = "{\"a\":-1,\"b\":2147483647,\"c\":0}"; @@ -394,35 +395,41 @@ static int unit_tests(void) return 0; } -static void get_s(int line_nr, char **s_in) +struct line { + struct string_list *parts; + size_t consumed_nr; + int nr; +}; + +static void get_s(struct line *line, char **s_in) { - *s_in = strtok(NULL, " "); - if (!*s_in) - die("line[%d]: expected: ", line_nr); + if (line->consumed_nr > line->parts->nr) + die("line[%d]: expected: ", line->nr); + *s_in = line->parts->items[line->consumed_nr++].string; } -static void get_i(int line_nr, intmax_t *s_in) +static void get_i(struct line *line, intmax_t *s_in) { char *s; char *endptr; - get_s(line_nr, &s); + get_s(line, &s); *s_in = strtol(s, &endptr, 10); if (*endptr || errno == ERANGE) - die("line[%d]: invalid integer value", line_nr); + die("line[%d]: invalid integer value", line->nr); } -static void get_d(int line_nr, double *s_in) +static void get_d(struct line *line, double *s_in) { char *s; char *endptr; - get_s(line_nr, &s); + get_s(line, &s); *s_in = strtod(s, &endptr); if (*endptr || errno == ERANGE) - die("line[%d]: invalid float value", line_nr); + die("line[%d]: invalid float value", line->nr); } static int pretty; @@ -453,6 +460,7 @@ static char *get_trimmed_line(char *buf, int buf_size) static int scripted(void) { + struct string_list parts = STRING_LIST_INIT_NODUP; struct json_writer jw = JSON_WRITER_INIT; char buf[MAX_LINE_LENGTH]; char *line; @@ -470,66 +478,77 @@ static int scripted(void) die("expected first line to be 'object' or 'array'"); while ((line = get_trimmed_line(buf, MAX_LINE_LENGTH)) != NULL) { + struct line state = { 0 }; char *verb; char *key; char *s_value; intmax_t i_value; double d_value; - line_nr++; + state.parts = &parts; + state.nr = ++line_nr; + + /* break line into command and zero or more tokens */ + string_list_setlen(&parts, 0); + string_list_split_in_place(&parts, line, " ", -1); + string_list_remove_empty_items(&parts, 0); + + /* ignore empty lines */ + if (!parts.nr || !*parts.items[0].string) + continue; - verb = strtok(line, " "); + verb = parts.items[state.consumed_nr++].string; if (!strcmp(verb, "end")) { jw_end(&jw); } else if (!strcmp(verb, "object-string")) { - get_s(line_nr, &key); - get_s(line_nr, &s_value); + get_s(&state, &key); + get_s(&state, &s_value); jw_object_string(&jw, key, s_value); } else if (!strcmp(verb, "object-int")) { - get_s(line_nr, &key); - get_i(line_nr, &i_value); + get_s(&state, &key); + get_i(&state, &i_value); jw_object_intmax(&jw, key, i_value); } else if (!strcmp(verb, "object-double")) { - get_s(line_nr, &key); - get_i(line_nr, &i_value); - get_d(line_nr, &d_value); + get_s(&state, &key); + get_i(&state, &i_value); + get_d(&state, &d_value); jw_object_double(&jw, key, i_value, d_value); } else if (!strcmp(verb, "object-true")) { - get_s(line_nr, &key); + get_s(&state, &key); jw_object_true(&jw, key); } else if (!strcmp(verb, "object-false")) { - get_s(line_nr, &key); + get_s(&state, &key); jw_object_false(&jw, key); } else if (!strcmp(verb, "object-null")) { - get_s(line_nr, &key); + get_s(&state, &key); jw_object_null(&jw, key); } else if (!strcmp(verb, "object-object")) { - get_s(line_nr, &key); + get_s(&state, &key); jw_object_inline_begin_object(&jw, key); } else if (!strcmp(verb, "object-array")) { - get_s(line_nr, &key); + get_s(&state, &key); jw_object_inline_begin_array(&jw, key); } else if (!strcmp(verb, "array-string")) { - get_s(line_nr, &s_value); + get_s(&state, &s_value); jw_array_string(&jw, s_value); } else if (!strcmp(verb, "array-int")) { - get_i(line_nr, &i_value); + get_i(&state, &i_value); jw_array_intmax(&jw, i_value); } else if (!strcmp(verb, "array-double")) { - get_i(line_nr, &i_value); - get_d(line_nr, &d_value); + get_i(&state, &i_value); + get_d(&state, &d_value); jw_array_double(&jw, i_value, d_value); } else if (!strcmp(verb, "array-true")) @@ -552,6 +571,7 @@ static int scripted(void) printf("%s\n", jw.json.buf); jw_release(&jw); + string_list_clear(&parts, 0); return 0; } -- cgit v1.3