From 350b87cd658553598a269fdd320ca05ee4789a10 Mon Sep 17 00:00:00 2001 From: Johannes Sixt Date: Fri, 8 Oct 2021 19:09:55 +0000 Subject: userdiff-cpp: tighten word regex Generally, word regexes can be written such that they match tokens liberally and need not model the actual syntax because it can be assumed that the regex will only be applied to syntactically correct text. The regex for cpp (C/C++) is too liberal, though. It regards these sequences as single tokens: 1+2 1.5-e+2+f and the following amalgams as one token: .l as in str.length .f as in str.find .e as in str.erase Tighten the regex in the following way: - Accept + and - only in one position in the exponent. + and - are no longer regarded as the sign of a number and are treated by the catch-all that is not visible in the driver's regex. - Accept a leading decimal point only when it is followed by a digit. For readability, factor hex- and binary numbers into a term of their own. As a drive-by, this fixes floating point numbers such as 12E5 (with upper-case E) being split into two tokens. Signed-off-by: Johannes Sixt Signed-off-by: Junio C Hamano --- userdiff.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'userdiff.c') diff --git a/userdiff.c b/userdiff.c index af02b1878c..8b49194f56 100644 --- a/userdiff.c +++ b/userdiff.c @@ -64,8 +64,14 @@ PATTERNS("cpp", /* functions/methods, variables, and compounds at top level */ "^((::[[:space:]]*)?[A-Za-z_].*)$", /* -- */ + /* identifiers and keywords */ "[a-zA-Z_][a-zA-Z0-9_]*" - "|[-+0-9.e]+[fFlL]?|0[xXbB]?[0-9a-fA-F]+[lLuU]*" + /* decimal and octal integers as well as floatingpoint numbers */ + "|[0-9][0-9.]*([Ee][-+]?[0-9]+)?[fFlLuU]*" + /* hexadecimal and binary integers */ + "|0[xXbB][0-9a-fA-F]+[lLuU]*" + /* floatingpoint numbers that begin with a decimal point */ + "|\\.[0-9]+([Ee][-+]?[0-9]+)?[fFlL]?" 
"|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->\\*?|\\.\\*"), PATTERNS("csharp", /* Keywords */ -- cgit v1.3 From 637b80cd6a2a73eb6723aec2f52aed1135d99de4 Mon Sep 17 00:00:00 2001 From: Johannes Sixt Date: Sun, 10 Oct 2021 17:03:03 +0000 Subject: userdiff-cpp: permit the digit-separating single-quote in numbers Since C++17, the single-quote can be used as digit separator: 3.141'592'654 1'000'000 0xdead'beaf Make it known to the word regex of the cpp driver, so that numbers are not split into separate tokens at the single-quotes. Signed-off-by: Johannes Sixt Signed-off-by: Junio C Hamano --- t/t4034/cpp/expect | 8 ++++---- userdiff.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'userdiff.c') diff --git a/t/t4034/cpp/expect b/t/t4034/cpp/expect index 3d37ddac42..b90b3f207b 100644 --- a/t/t4034/cpp/expect +++ b/t/t4034/cpp/expect @@ -7,15 +7,15 @@ Foo() : x(0&&1&42) { foo0bar cout<<"Hello World!?\n"<(1 -+1e10 0xabcdef) 'x.' // long double -3.141'592'653e-10l654e+10l +3.141'592'653e-10l3.141'592'654e+10l // float 120E5f120E6f // hex -0xdead'beafBeaf+8ULL7ULL +0xdead'beaf0xdead'Beaf+8ULL7ULL // octal -0123'45674560 +0123'45670123'4560 // binary -0b100b11'00+e1 +0b10'000b11'00+e1 // expression 1.5-e+23+f // another one diff --git a/userdiff.c b/userdiff.c index 8b49194f56..c1084650dd 100644 --- a/userdiff.c +++ b/userdiff.c @@ -67,11 +67,11 @@ PATTERNS("cpp", /* identifiers and keywords */ "[a-zA-Z_][a-zA-Z0-9_]*" /* decimal and octal integers as well as floatingpoint numbers */ - "|[0-9][0-9.]*([Ee][-+]?[0-9]+)?[fFlLuU]*" + "|[0-9][0-9.']*([Ee][-+]?[0-9]+)?[fFlLuU]*" /* hexadecimal and binary integers */ - "|0[xXbB][0-9a-fA-F]+[lLuU]*" + "|0[xXbB][0-9a-fA-F']+[lLuU]*" /* floatingpoint numbers that begin with a decimal point */ - "|\\.[0-9]+([Ee][-+]?[0-9]+)?[fFlL]?" + "|\\.[0-9][0-9']*([Ee][-+]?[0-9]+)?[fFlL]?" 
"|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->\\*?|\\.\\*"), PATTERNS("csharp", /* Keywords */ -- cgit v1.3 From c4fdba338355d80e40b84391af9f8c022d4f21af Mon Sep 17 00:00:00 2001 From: Johannes Sixt Date: Sun, 10 Oct 2021 17:03:04 +0000 Subject: userdiff-cpp: learn the C++ spaceship operator Since C++20, the language has a generalized comparison operator <=>. Teach the cpp driver not to separate it into <= and > tokens. Signed-off-by: Johannes Sixt Signed-off-by: Junio C Hamano --- t/t4034/cpp/expect | 2 +- userdiff.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'userdiff.c') diff --git a/t/t4034/cpp/expect b/t/t4034/cpp/expect index b90b3f207b..5ff4ce477b 100644 --- a/t/t4034/cpp/expect +++ b/t/t4034/cpp/expect @@ -25,7 +25,7 @@ str.e+6575 a**=b c//=d e%%=f a+++b c---d a<<<<=b c>>>>=d -a<<=b c<=<d e>>=f g>=>h i<=>j +a<<=b c<=<d e>>=f g>=>h i<=<=>j a==!=b c!==d a^^=b c||=d e&&&=f a|||b diff --git a/userdiff.c b/userdiff.c index c1084650dd..7b143ef36b 100644 --- a/userdiff.c +++ b/userdiff.c @@ -72,7 +72,7 @@ PATTERNS("cpp", "|0[xXbB][0-9a-fA-F']+[lLuU]*" /* floatingpoint numbers that begin with a decimal point */ "|\\.[0-9][0-9']*([Ee][-+]?[0-9]+)?[fFlL]?" - "|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->\\*?|\\.\\*"), + "|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->\\*?|\\.\\*|<=>"), PATTERNS("csharp", /* Keywords */ "!^[ \t]*(do|while|for|if|else|instanceof|new|return|switch|case|throw|catch|using)\n" -- cgit v1.3 From 386076ec92c702104cb15bc23e4521dac10c7c2d Mon Sep 17 00:00:00 2001 From: Johannes Sixt Date: Sun, 24 Oct 2021 11:56:43 +0200 Subject: userdiff-cpp: back out the digit-separators in numbers The implementation of digit-separating single-quotes introduced a note-worthy regression: the change of a character literal with a digit would splice the digit and the closing single-quote. For example, the change from 'a' to '2' is now tokenized as '[-a'-]{+2'+} instead of '[-a-]{+2+}'. 
The options to fix the regression are: - Tighten the regular expression such that the single-quote can only occur between digits (that would match the official syntax). - Remove support for digit separators. I chose to remove support, because - I have not seen a lot of code make use of digit separators. - If code does use digit separators, then the numbers are typically long. If a change in one of the segments occurs, it is actually more clearly visible if only that segment is highlighted as the word that changed instead of the whole long number. This choice does introduce another minor regression, though, which is highlighted in the test case: when a change occurs in the second or later segment of a hexadecimal number where the segment begins with a digit, but also has letters, the segment is mistaken for a number followed by an identifier. I can live with that. Signed-off-by: Johannes Sixt Signed-off-by: Junio C Hamano --- t/t4034/cpp/expect | 12 ++++++------ t/t4034/cpp/post | 10 +++++----- t/t4034/cpp/pre | 8 ++++---- userdiff.c | 6 +++--- 4 files changed, 18 insertions(+), 18 deletions(-) (limited to 'userdiff.c') diff --git a/t/t4034/cpp/expect b/t/t4034/cpp/expect index 5ff4ce477b..dc500ae092 100644 --- a/t/t4034/cpp/expect +++ b/t/t4034/cpp/expect @@ -1,21 +1,21 @@ diff --git a/pre b/post -index 144cd98..64e78af 100644 +index a1a09b7..f1b6f3c 100644 --- a/pre +++ b/post @@ -1,30 +1,30 @@ Foo() : x(0&&1&42) { foo0bar(x.findFind); } cout<<"Hello World!?\n"<(1 -+1e10 0xabcdef) 'x.' 
+(1 -+1e10 0xabcdef) 'x2' // long double -3.141'592'653e-10l3.141'592'654e+10l +3.141592653e-10l3.141592654e+10l // float 120E5f120E6f // hex -0xdead'beaf0xdead'Beaf+8ULL7ULL +0xdead0xdeaf'1eaFeaf+8ULL7ULL // octal -0123'45670123'4560 +0123456701234560 // binary -0b10'000b11'00+e1 +0b10000b1100+e1 // expression 1.5-e+23+f // another one diff --git a/t/t4034/cpp/post b/t/t4034/cpp/post index 64e78afbfb..f1b6f3c228 100644 --- a/t/t4034/cpp/post +++ b/t/t4034/cpp/post @@ -1,16 +1,16 @@ Foo() : x(0&42) { bar(x.Find); } cout<<"Hello World?\n"<%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->\\*?|\\.\\*|<=>"), PATTERNS("csharp", /* Keywords */ -- cgit v1.3