diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/regexp/syntax/parse.go | 16 | ||||
| -rw-r--r-- | src/regexp/syntax/parse_test.go | 36 | ||||
| -rw-r--r-- | src/regexp/syntax/regexp.go | 217 | ||||
| -rw-r--r-- | src/regexp/syntax/simplify_test.go | 16 |
4 files changed, 248 insertions, 37 deletions
diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go index a4ccfe3bdb..6b360b8700 100644 --- a/src/regexp/syntax/parse.go +++ b/src/regexp/syntax/parse.go @@ -1863,6 +1863,22 @@ func cleanClass(rp *[]rune) []rune { return r[:w] } +// inCharClass reports whether r is in the class. +// It assumes the class has been cleaned by cleanClass. +func inCharClass(r rune, class []rune) bool { + _, ok := sort.Find(len(class)/2, func(i int) int { + lo, hi := class[2*i], class[2*i+1] + if r > hi { + return +1 + } + if r < lo { + return -1 + } + return 0 + }) + return ok +} + // appendLiteral returns the result of appending the literal x to the class r. func appendLiteral(r []rune, x rune, flags Flags) []rune { if flags&FoldCase != 0 { diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go index d7999046e0..0f885bd5c8 100644 --- a/src/regexp/syntax/parse_test.go +++ b/src/regexp/syntax/parse_test.go @@ -590,3 +590,39 @@ func TestToStringEquivalentParse(t *testing.T) { } } } + +var stringTests = []struct { + re string + out string +}{ + {`x(?i:ab*c|d?e)1`, `x(?i:AB*C|D?E)1`}, + {`x(?i:ab*cd?e)1`, `x(?i:AB*CD?E)1`}, + {`0(?i:ab*c|d?e)1`, `(?i:0(?:AB*C|D?E)1)`}, + {`0(?i:ab*cd?e)1`, `(?i:0AB*CD?E1)`}, + {`x(?i:ab*c|d?e)`, `x(?i:AB*C|D?E)`}, + {`x(?i:ab*cd?e)`, `x(?i:AB*CD?E)`}, + {`0(?i:ab*c|d?e)`, `(?i:0(?:AB*C|D?E))`}, + {`0(?i:ab*cd?e)`, `(?i:0AB*CD?E)`}, + {`(?i:ab*c|d?e)1`, `(?i:(?:AB*C|D?E)1)`}, + {`(?i:ab*cd?e)1`, `(?i:AB*CD?E1)`}, + {`(?i:ab)[123](?i:cd)`, `(?i:AB[1-3]CD)`}, + {`(?i:ab*c|d?e)`, `(?i:AB*C|D?E)`}, + {`[Aa][Bb]`, `(?i:AB)`}, + {`[Aa][Bb]*[Cc]`, `(?i:AB*C)`}, + {`A(?:[Bb][Cc]|[Dd])[Zz]`, `A(?i:(?:BC|D)Z)`}, + {`[Aa](?:[Bb][Cc]|[Dd])Z`, `(?i:A(?:BC|D))Z`}, +} + +func TestString(t *testing.T) { + for _, tt := range stringTests { + re, err := Parse(tt.re, Perl) + if err != nil { + t.Errorf("Parse(%#q): %v", tt.re, err) + continue + } + out := re.String() + if out != tt.out { + t.Errorf("Parse(%#q).String() = %#q, want %#q", tt.re, out, tt.out) + } + } +} diff --git a/src/regexp/syntax/regexp.go b/src/regexp/syntax/regexp.go index 3a4d2d201c..4fa7d0e2f8 100644 --- a/src/regexp/syntax/regexp.go +++ b/src/regexp/syntax/regexp.go @@ -112,8 +112,165 @@ func (x *Regexp) Equal(y *Regexp) bool { return true } +// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp. +type printFlags uint8 + +const ( + flagI printFlags = 1 << iota // (?i: + flagM // (?m: + flagS // (?s: + flagOff // ) + flagPrec // (?: ) + negShift = 5 // flagI<<negShift is (?-i: +) + +// addSpan enables the flags f around start..last, +// by setting flags[start] = f and flags[last] = flagOff. +func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) { + if *flags == nil { + *flags = make(map[*Regexp]printFlags) + } + (*flags)[start] = f + (*flags)[last] |= flagOff // maybe start==last +} + +// calcFlags calculates the flags to print around each subexpression in re, +// storing that information in (*flags)[sub] for each affected subexpression. +// The first time an entry needs to be written to *flags, calcFlags allocates the map. +// calcFlags also calculates the flags that must be active or can't be active +// around re and returns those flags. +func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) { + switch re.Op { + default: + return 0, 0 + + case OpLiteral: + // If literal is fold-sensitive, return (flagI, 0) or (0, flagI) + // according to whether (?i) is active. + // If literal is not fold-sensitive, return 0, 0. + for _, r := range re.Rune { + if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r { + if re.Flags&FoldCase != 0 { + return flagI, 0 + } else { + return 0, flagI + } + } + } + return 0, 0 + + case OpCharClass: + // If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out. + // If literal is not fold-sensitive, return 0, 0. + for i := 0; i < len(re.Rune); i += 2 { + lo := max(minFold, re.Rune[i]) + hi := min(maxFold, re.Rune[i+1]) + for r := lo; r <= hi; r++ { + for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) { + if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) { + return 0, flagI + } + } + } + } + return 0, 0 + + case OpAnyCharNotNL: // (?-s). + return 0, flagS + + case OpAnyChar: // (?s). + return flagS, 0 + + case OpBeginLine, OpEndLine: // (?m)^ (?m)$ + return flagM, 0 + + case OpEndText: + if re.Flags&WasDollar != 0 { // (?-m)$ + return 0, flagM + } + return 0, 0 + + case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat: + return calcFlags(re.Sub[0], flags) + + case OpConcat, OpAlternate: + // Gather the must and cant for each subexpression. + // When we find a conflicting subexpression, insert the necessary + // flags around the previously identified span and start over. + var must, cant, allCant printFlags + start := 0 + last := 0 + did := false + for i, sub := range re.Sub { + subMust, subCant := calcFlags(sub, flags) + if must&subCant != 0 || subMust&cant != 0 { + if must != 0 { + addSpan(re.Sub[start], re.Sub[last], must, flags) + } + must = 0 + cant = 0 + start = i + did = true + } + must |= subMust + cant |= subCant + allCant |= subCant + if subMust != 0 { + last = i + } + if must == 0 && start == i { + start++ + } + } + if !did { + // No conflicts: pass the accumulated must and cant upward. + return must, cant + } + if must != 0 { + // Conflicts found; need to finish final span. + addSpan(re.Sub[start], re.Sub[last], must, flags) + } + return 0, allCant + } +} + // writeRegexp writes the Perl syntax for the regular expression re to b. -func writeRegexp(b *strings.Builder, re *Regexp) { +func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) { + f |= flags[re] + if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 { + // flagPrec is redundant with other flags being added and terminated + f &^= flagPrec + } + if f&^(flagOff|flagPrec) != 0 { + b.WriteString(`(?`) + if f&flagI != 0 { + b.WriteString(`i`) + } + if f&flagM != 0 { + b.WriteString(`m`) + } + if f&flagS != 0 { + b.WriteString(`s`) + } + if f&((flagM|flagS)<<negShift) != 0 { + b.WriteString(`-`) + if f&(flagM<<negShift) != 0 { + b.WriteString(`m`) + } + if f&(flagS<<negShift) != 0 { + b.WriteString(`s`) + } + } + b.WriteString(`:`) + } + if f&flagOff != 0 { + defer b.WriteString(`)`) + } + if f&flagPrec != 0 { + b.WriteString(`(?:`) + defer b.WriteString(`)`) + } + switch re.Op { default: b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") @@ -122,15 +279,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) { case OpEmptyMatch: b.WriteString(`(?:)`) case OpLiteral: - if re.Flags&FoldCase != 0 { - b.WriteString(`(?i:`) - } for _, r := range re.Rune { escape(b, r, false) } - if re.Flags&FoldCase != 0 { - b.WriteString(`)`) - } case OpCharClass: if len(re.Rune)%2 != 0 { b.WriteString(`[invalid char class]`) @@ -147,7 +298,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) { lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 escape(b, lo, lo == '-') if lo != hi { - b.WriteRune('-') + if hi != lo+1 { + b.WriteRune('-') + } escape(b, hi, hi == '-') } } @@ -156,25 +309,25 @@ func writeRegexp(b *strings.Builder, re *Regexp) { lo, hi := re.Rune[i], re.Rune[i+1] escape(b, lo, lo == '-') if lo != hi { - b.WriteRune('-') + if hi != lo+1 { + b.WriteRune('-') + } escape(b, hi, hi == '-') } } } b.WriteRune(']') - case OpAnyCharNotNL: - b.WriteString(`(?-s:.)`) - case OpAnyChar: - b.WriteString(`(?s:.)`) + case OpAnyCharNotNL, OpAnyChar: + b.WriteString(`.`) case OpBeginLine: - b.WriteString(`(?m:^)`) + b.WriteString(`^`) case OpEndLine: - b.WriteString(`(?m:$)`) + b.WriteString(`$`) case OpBeginText: b.WriteString(`\A`) case OpEndText: if re.Flags&WasDollar != 0 { - b.WriteString(`(?-m:$)`) + b.WriteString(`$`) } else { b.WriteString(`\z`) } @@ -191,17 +344,17 @@ func writeRegexp(b *strings.Builder, re *Regexp) { b.WriteRune('(') } if re.Sub[0].Op != OpEmptyMatch { - writeRegexp(b, re.Sub[0]) + writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags) } b.WriteRune(')') case OpStar, OpPlus, OpQuest, OpRepeat: - if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { - b.WriteString(`(?:`) - writeRegexp(b, sub) - b.WriteString(`)`) - } else { - writeRegexp(b, sub) + p := printFlags(0) + sub := re.Sub[0] + if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { + p = flagPrec } + writeRegexp(b, sub, p, flags) + switch re.Op { case OpStar: b.WriteRune('*') @@ -225,27 +378,31 @@ func writeRegexp(b *strings.Builder, re *Regexp) { } case OpConcat: for _, sub := range re.Sub { + p := printFlags(0) if sub.Op == OpAlternate { - b.WriteString(`(?:`) - writeRegexp(b, sub) - b.WriteString(`)`) - } else { - writeRegexp(b, sub) + p = flagPrec } + writeRegexp(b, sub, p, flags) } case OpAlternate: for i, sub := range re.Sub { if i > 0 { b.WriteRune('|') } - writeRegexp(b, sub) + writeRegexp(b, sub, 0, flags) } } } func (re *Regexp) String() string { var b strings.Builder - writeRegexp(&b, re) + var flags map[*Regexp]printFlags + must, cant := calcFlags(re, &flags) + must |= (cant &^ flagI) << negShift + if must != 0 { + must |= flagOff + } + writeRegexp(&b, re, must, flags) return b.String() } diff --git a/src/regexp/syntax/simplify_test.go b/src/regexp/syntax/simplify_test.go index 9877db3d0a..6d06f99c1b 100644 --- a/src/regexp/syntax/simplify_test.go +++ b/src/regexp/syntax/simplify_test.go @@ -13,7 +13,7 @@ var simplifyTests = []struct { // Already-simple constructs {`a`, `a`}, {`ab`, `ab`}, - {`a|b`, `[a-b]`}, + {`a|b`, `[ab]`}, {`ab|cd`, `ab|cd`}, {`(ab)*`, `(ab)*`}, {`(ab)+`, `(ab)+`}, @@ -40,16 +40,16 @@ var simplifyTests = []struct { // Perl character classes {`\d`, `[0-9]`}, - {`\s`, `[\t-\n\f-\r ]`}, + {`\s`, `[\t\n\f\r ]`}, {`\w`, `[0-9A-Z_a-z]`}, {`\D`, `[^0-9]`}, - {`\S`, `[^\t-\n\f-\r ]`}, + {`\S`, `[^\t\n\f\r ]`}, {`\W`, `[^0-9A-Z_a-z]`}, {`[\d]`, `[0-9]`}, - {`[\s]`, `[\t-\n\f-\r ]`}, + {`[\s]`, `[\t\n\f\r ]`}, {`[\w]`, `[0-9A-Z_a-z]`}, {`[\D]`, `[^0-9]`}, - {`[\S]`, `[^\t-\n\f-\r ]`}, + {`[\S]`, `[^\t\n\f\r ]`}, {`[\W]`, `[^0-9A-Z_a-z]`}, // Posix repetitions @@ -82,7 +82,8 @@ var simplifyTests = []struct { {`a{0}`, `(?:)`}, // Character class simplification - {`[ab]`, `[a-b]`}, + {`[ab]`, `[ab]`}, + {`[abc]`, `[a-c]`}, {`[a-za-za-z]`, `[a-z]`}, {`[A-Za-zA-Za-z]`, `[A-Za-z]`}, {`[ABCDEFGH]`, `[A-H]`}, @@ -120,7 +121,8 @@ var simplifyTests = []struct { // interesting than they might otherwise be. String inserts // explicit (?:) in place of non-parenthesized empty strings, // to make them easier to spot for other parsers. - {`(a|b|)`, `([a-b]|(?:))`}, + {`(a|b|c|)`, `([a-c]|(?:))`}, + {`(a|b|)`, `([ab]|(?:))`}, {`(|)`, `()`}, {`a()`, `a()`}, {`(()|())`, `(()|())`}, |
