4 files changed, 248 insertions, 37 deletions
diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go
index a4ccfe3bdb..6b360b8700 100644
--- a/src/regexp/syntax/parse.go
+++ b/src/regexp/syntax/parse.go
@@ -1863,6 +1863,22 @@ func cleanClass(rp *[]rune) []rune {
 	return r[:w]
 }
 
+// inCharClass reports whether r is in the class.
+// It assumes the class has been cleaned by cleanClass.
+func inCharClass(r rune, class []rune) bool {
+	_, ok := sort.Find(len(class)/2, func(i int) int {
+		lo, hi := class[2*i], class[2*i+1]
+		if r > hi {
+			return +1
+		}
+		if r < lo {
+			return -1
+		}
+		return 0
+	})
+	return ok
+}
+
 // appendLiteral returns the result of appending the literal x to the class r.
 func appendLiteral(r []rune, x rune, flags Flags) []rune {
 	if flags&FoldCase != 0 {
diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go
index d7999046e0..0f885bd5c8 100644
--- a/src/regexp/syntax/parse_test.go
+++ b/src/regexp/syntax/parse_test.go
@@ -590,3 +590,39 @@ func TestToStringEquivalentParse(t *testing.T) {
 		}
 	}
 }
+
+var stringTests = []struct {
+	re  string
+	out string
+}{
+	{`x(?i:ab*c|d?e)1`, `x(?i:AB*C|D?E)1`},
+	{`x(?i:ab*cd?e)1`, `x(?i:AB*CD?E)1`},
+	{`0(?i:ab*c|d?e)1`, `(?i:0(?:AB*C|D?E)1)`},
+	{`0(?i:ab*cd?e)1`, `(?i:0AB*CD?E1)`},
+	{`x(?i:ab*c|d?e)`, `x(?i:AB*C|D?E)`},
+	{`x(?i:ab*cd?e)`, `x(?i:AB*CD?E)`},
+	{`0(?i:ab*c|d?e)`, `(?i:0(?:AB*C|D?E))`},
+	{`0(?i:ab*cd?e)`, `(?i:0AB*CD?E)`},
+	{`(?i:ab*c|d?e)1`, `(?i:(?:AB*C|D?E)1)`},
+	{`(?i:ab*cd?e)1`, `(?i:AB*CD?E1)`},
+	{`(?i:ab)[123](?i:cd)`, `(?i:AB[1-3]CD)`},
+	{`(?i:ab*c|d?e)`, `(?i:AB*C|D?E)`},
+	{`[Aa][Bb]`, `(?i:AB)`},
+	{`[Aa][Bb]*[Cc]`, `(?i:AB*C)`},
+	{`A(?:[Bb][Cc]|[Dd])[Zz]`, `A(?i:(?:BC|D)Z)`},
+	{`[Aa](?:[Bb][Cc]|[Dd])Z`, `(?i:A(?:BC|D))Z`},
+}
+
+func TestString(t *testing.T) {
+	for _, tt := range stringTests {
+		re, err := Parse(tt.re, Perl)
+		if err != nil {
+			t.Errorf("Parse(%#q): %v", tt.re, err)
+			continue
+		}
+		out := re.String()
+		if out != tt.out {
+			t.Errorf("Parse(%#q).String() = %#q, want %#q", tt.re, out, tt.out)
+		}
+	}
+}
diff --git a/src/regexp/syntax/regexp.go b/src/regexp/syntax/regexp.go
index 3a4d2d201c..4fa7d0e2f8 100644
--- a/src/regexp/syntax/regexp.go
+++ b/src/regexp/syntax/regexp.go
@@ -112,8 +112,165 @@ func (x *Regexp) Equal(y *Regexp) bool {
 	return true
 }
 
+// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
+type printFlags uint8
+
+const (
+	flagI    printFlags = 1 << iota // (?i:
+	flagM                           // (?m:
+	flagS                           // (?s:
+	flagOff                         // )
+	flagPrec                        // (?: )
+	negShift = 5                    // flagI<<negShift is (?-i:
+)
+
+// addSpan enables the flags f around start..last,
+// by setting flags[start] = f and flags[last] = flagOff.
+func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
+	if *flags == nil {
+		*flags = make(map[*Regexp]printFlags)
+	}
+	(*flags)[start] = f
+	(*flags)[last] |= flagOff // maybe start==last
+}
+
+// calcFlags calculates the flags to print around each subexpression in re,
+// storing that information in (*flags)[sub] for each affected subexpression.
+// The first time an entry needs to be written to *flags, calcFlags allocates the map.
+// calcFlags also calculates the flags that must be active or can't be active
+// around re and returns those flags.
+func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
+	switch re.Op {
+	default:
+		return 0, 0
+
+	case OpLiteral:
+		// If literal is fold-sensitive, return (flagI, 0) or (0, flagI)
+		// according to whether (?i) is active.
+		// If literal is not fold-sensitive, return 0, 0.
+		for _, r := range re.Rune {
+			if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
+				if re.Flags&FoldCase != 0 {
+					return flagI, 0
+				} else {
+					return 0, flagI
+				}
+			}
+		}
+		return 0, 0
+
+	case OpCharClass:
+		// If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out.
+		// If literal is not fold-sensitive, return 0, 0.
+		for i := 0; i < len(re.Rune); i += 2 {
+			lo := max(minFold, re.Rune[i])
+			hi := min(maxFold, re.Rune[i+1])
+			for r := lo; r <= hi; r++ {
+				for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
+					if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
+						return 0, flagI
+					}
+				}
+			}
+		}
+		return 0, 0
+
+	case OpAnyCharNotNL: // (?-s).
+		return 0, flagS
+
+	case OpAnyChar: // (?s).
+		return flagS, 0
+
+	case OpBeginLine, OpEndLine: // (?m)^ (?m)$
+		return flagM, 0
+
+	case OpEndText:
+		if re.Flags&WasDollar != 0 { // (?-m)$
+			return 0, flagM
+		}
+		return 0, 0
+
+	case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
+		return calcFlags(re.Sub[0], flags)
+
+	case OpConcat, OpAlternate:
+		// Gather the must and cant for each subexpression.
+		// When we find a conflicting subexpression, insert the necessary
+		// flags around the previously identified span and start over.
+		var must, cant, allCant printFlags
+		start := 0
+		last := 0
+		did := false
+		for i, sub := range re.Sub {
+			subMust, subCant := calcFlags(sub, flags)
+			if must&subCant != 0 || subMust&cant != 0 {
+				if must != 0 {
+					addSpan(re.Sub[start], re.Sub[last], must, flags)
+				}
+				must = 0
+				cant = 0
+				start = i
+				did = true
+			}
+			must |= subMust
+			cant |= subCant
+			allCant |= subCant
+			if subMust != 0 {
+				last = i
+			}
+			if must == 0 && start == i {
+				start++
+			}
+		}
+		if !did {
+			// No conflicts: pass the accumulated must and cant upward.
+			return must, cant
+		}
+		if must != 0 {
+			// Conflicts found; need to finish final span.
+			addSpan(re.Sub[start], re.Sub[last], must, flags)
+		}
+		return 0, allCant
+	}
+}
+
 // writeRegexp writes the Perl syntax for the regular expression re to b.
-func writeRegexp(b *strings.Builder, re *Regexp) {
+func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
+	f |= flags[re]
+	if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
+		// flagPrec is redundant with other flags being added and terminated
+		f &^= flagPrec
+	}
+	if f&^(flagOff|flagPrec) != 0 {
+		b.WriteString(`(?`)
+		if f&flagI != 0 {
+			b.WriteString(`i`)
+		}
+		if f&flagM != 0 {
+			b.WriteString(`m`)
+		}
+		if f&flagS != 0 {
+			b.WriteString(`s`)
+		}
+		if f&((flagM|flagS)<<negShift) != 0 {
+			b.WriteString(`-`)
+			if f&(flagM<<negShift) != 0 {
+				b.WriteString(`m`)
+			}
+			if f&(flagS<<negShift) != 0 {
+				b.WriteString(`s`)
+			}
+		}
+		b.WriteString(`:`)
+	}
+	if f&flagOff != 0 {
+		defer b.WriteString(`)`)
+	}
+	if f&flagPrec != 0 {
+		b.WriteString(`(?:`)
+		defer b.WriteString(`)`)
+	}
+
 	switch re.Op {
 	default:
 		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
@@ -122,15 +279,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 	case OpEmptyMatch:
 		b.WriteString(`(?:)`)
 	case OpLiteral:
-		if re.Flags&FoldCase != 0 {
-			b.WriteString(`(?i:`)
-		}
 		for _, r := range re.Rune {
 			escape(b, r, false)
 		}
-		if re.Flags&FoldCase != 0 {
-			b.WriteString(`)`)
-		}
 	case OpCharClass:
 		if len(re.Rune)%2 != 0 {
 			b.WriteString(`[invalid char class]`)
@@ -147,7 +298,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
 				escape(b, lo, lo == '-')
 				if lo != hi {
-					b.WriteRune('-')
+					if hi != lo+1 {
+						b.WriteRune('-')
+					}
 					escape(b, hi, hi == '-')
 				}
 			}
@@ -156,25 +309,25 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 				lo, hi := re.Rune[i], re.Rune[i+1]
 				escape(b, lo, lo == '-')
 				if lo != hi {
-					b.WriteRune('-')
+					if hi != lo+1 {
+						b.WriteRune('-')
+					}
 					escape(b, hi, hi == '-')
 				}
 			}
 		}
 		b.WriteRune(']')
-	case OpAnyCharNotNL:
-		b.WriteString(`(?-s:.)`)
-	case OpAnyChar:
-		b.WriteString(`(?s:.)`)
+	case OpAnyCharNotNL, OpAnyChar:
+		b.WriteString(`.`)
 	case OpBeginLine:
-		b.WriteString(`(?m:^)`)
+		b.WriteString(`^`)
 	case OpEndLine:
-		b.WriteString(`(?m:$)`)
+		b.WriteString(`$`)
 	case OpBeginText:
 		b.WriteString(`\A`)
 	case OpEndText:
 		if re.Flags&WasDollar != 0 {
-			b.WriteString(`(?-m:$)`)
+			b.WriteString(`$`)
 		} else {
 			b.WriteString(`\z`)
 		}
@@ -191,17 +344,17 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 			b.WriteRune('(')
 		}
 		if re.Sub[0].Op != OpEmptyMatch {
-			writeRegexp(b, re.Sub[0])
+			writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
 		}
 		b.WriteRune(')')
 	case OpStar, OpPlus, OpQuest, OpRepeat:
-		if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
-			b.WriteString(`(?:`)
-			writeRegexp(b, sub)
-			b.WriteString(`)`)
-		} else {
-			writeRegexp(b, sub)
+		p := printFlags(0)
+		sub := re.Sub[0]
+		if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
+			p = flagPrec
 		}
+		writeRegexp(b, sub, p, flags)
+
 		switch re.Op {
 		case OpStar:
 			b.WriteRune('*')
@@ -225,27 +378,31 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 		}
 	case OpConcat:
 		for _, sub := range re.Sub {
+			p := printFlags(0)
 			if sub.Op == OpAlternate {
-				b.WriteString(`(?:`)
-				writeRegexp(b, sub)
-				b.WriteString(`)`)
-			} else {
-				writeRegexp(b, sub)
+				p = flagPrec
 			}
+			writeRegexp(b, sub, p, flags)
 		}
 	case OpAlternate:
 		for i, sub := range re.Sub {
 			if i > 0 {
 				b.WriteRune('|')
 			}
-			writeRegexp(b, sub)
+			writeRegexp(b, sub, 0, flags)
 		}
 	}
 }
 
 func (re *Regexp) String() string {
 	var b strings.Builder
-	writeRegexp(&b, re)
+	var flags map[*Regexp]printFlags
+	must, cant := calcFlags(re, &flags)
+	must |= (cant &^ flagI) << negShift
+	if must != 0 {
+		must |= flagOff
+	}
+	writeRegexp(&b, re, must, flags)
 	return b.String()
 }
 
diff --git a/src/regexp/syntax/simplify_test.go b/src/regexp/syntax/simplify_test.go
index 9877db3d0a..6d06f99c1b 100644
--- a/src/regexp/syntax/simplify_test.go
+++ b/src/regexp/syntax/simplify_test.go
@@ -13,7 +13,7 @@ var simplifyTests = []struct {
 	// Already-simple constructs
 	{`a`, `a`},
 	{`ab`, `ab`},
-	{`a|b`, `[a-b]`},
+	{`a|b`, `[ab]`},
 	{`ab|cd`, `ab|cd`},
 	{`(ab)*`, `(ab)*`},
 	{`(ab)+`, `(ab)+`},
@@ -40,16 +40,16 @@ var simplifyTests = []struct {
 
 	// Perl character classes
 	{`\d`, `[0-9]`},
-	{`\s`, `[\t-\n\f-\r ]`},
+	{`\s`, `[\t\n\f\r ]`},
 	{`\w`, `[0-9A-Z_a-z]`},
 	{`\D`, `[^0-9]`},
-	{`\S`, `[^\t-\n\f-\r ]`},
+	{`\S`, `[^\t\n\f\r ]`},
 	{`\W`, `[^0-9A-Z_a-z]`},
 	{`[\d]`, `[0-9]`},
-	{`[\s]`, `[\t-\n\f-\r ]`},
+	{`[\s]`, `[\t\n\f\r ]`},
 	{`[\w]`, `[0-9A-Z_a-z]`},
 	{`[\D]`, `[^0-9]`},
-	{`[\S]`, `[^\t-\n\f-\r ]`},
+	{`[\S]`, `[^\t\n\f\r ]`},
 	{`[\W]`, `[^0-9A-Z_a-z]`},
 
 	// Posix repetitions
@@ -82,7 +82,8 @@ var simplifyTests = []struct {
 	{`a{0}`, `(?:)`},
 
 	// Character class simplification
-	{`[ab]`, `[a-b]`},
+	{`[ab]`, `[ab]`},
+	{`[abc]`, `[a-c]`},
 	{`[a-za-za-z]`, `[a-z]`},
 	{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
 	{`[ABCDEFGH]`, `[A-H]`},
@@ -120,7 +121,8 @@ var simplifyTests = []struct {
 	// interesting than they might otherwise be. String inserts
 	// explicit (?:) in place of non-parenthesized empty strings,
 	// to make them easier to spot for other parsers.
-	{`(a|b|)`, `([a-b]|(?:))`},
+	{`(a|b|c|)`, `([a-c]|(?:))`},
+	{`(a|b|)`, `([ab]|(?:))`},
 	{`(|)`, `()`},
 	{`a()`, `a()`},
 	{`(()|())`, `(()|())`},