diff options
| author | Jonathan Amsterdam <jba@google.com> | 2020-01-30 12:56:42 -0500 |
|---|---|---|
| committer | Julie Qiu <julie@golang.org> | 2020-03-27 16:46:52 -0400 |
| commit | a8ae5959deb7a8b06d00f3e65ba71800f4afaefb (patch) | |
| tree | 7fc3274e436a52274759d62b937faae594153bc0 /internal/postgres/versionstate.go | |
| parent | 4a4fa42a48095c52b19904fc0953daaf7db4104f (diff) | |
| download | go-x-pkgsite-a8ae5959deb7a8b06d00f3e65ba71800f4afaefb.tar.xz | |
internal/postgres: improve reprocessing order
It takes several days to reprocess all our data. Most of that is taken
up with numerous forks of kubernetes and other large modules. Many of
these are quickly marked as alternatives, but many older versions lack
a go.mod file, so we fully process each one, and they take a long
time.
Process the most recent versions of all modules first. Then process
all modules other than kubernetes and the other large ones. Leave
non-latest versions of the large modules for last.
Change-Id: Ibdfec3c5c3ac996d3cdcfd8988421db498e5e553
Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/653246
CI-Result: Cloud Build <devtools-proctor-result-processor@system.gserviceaccount.com>
Reviewed-by: Julie Qiu <julieqiu@google.com>
Diffstat (limited to 'internal/postgres/versionstate.go')
| -rw-r--r-- | internal/postgres/versionstate.go | 112 |
1 files changed, 106 insertions, 6 deletions
diff --git a/internal/postgres/versionstate.go b/internal/postgres/versionstate.go index f6a04207..377ba536 100644 --- a/internal/postgres/versionstate.go +++ b/internal/postgres/versionstate.go @@ -8,6 +8,8 @@ import ( "context" "database/sql" "fmt" + "io" + "strings" "time" "github.com/lib/pq" @@ -214,22 +216,120 @@ func (db *DB) queryModuleVersionStates(ctx context.Context, queryFormat string, // GetNextVersionsToFetch returns the next batch of versions that must be // processed. -// Prefer release versions to prerelease, and higher versions to lower. func (db *DB) GetNextVersionsToFetch(ctx context.Context, limit int) (_ []*internal.ModuleVersionState, err error) { + // We want to prioritize the latest versions over other ones, and we want to leave time-consuming + // modules until the end. So we run several queries in succession, appending their results until + // we reach the limit: latest release versions, latest non-release versions, everything else except + // slow modules, and finally the slow modules. defer derrors.Wrap(&err, "GetNextVersionsToFetch(ctx, %d)", limit) - queryFormat := ` - SELECT %s + // Query for getting latest release or non-release versions. The first + // argument to Sprintf will be the columns, the second the operator. If the + // operator is "=", we will select all the release versions; if "!=", then + // the non-release versions. + // + // Adding the nonsensical (and never true) "OR right(sort_version, 2) = ''" + // generates a much faster query plan for the '=' case, and preserves the fast + // query plan for the "!=" case. Go fig. + maxQuery := ` + WITH max_versions AS ( + SELECT module_path, max(sort_version) AS max_sv + FROM module_version_states + -- Compare the last character of sort_version to '~'. 
+ WHERE right(sort_version, 1) %[2]s '~' OR right(sort_version, 2) = '' + GROUP BY 1 + ) + SELECT + %[1]s FROM - module_version_states + module_version_states s + INNER JOIN + max_versions m + ON + s.module_path = m.module_path + AND + s.sort_version = m.max_sv WHERE (status IS NULL OR status >= 500) AND next_processed_after < CURRENT_TIMESTAMP ORDER BY + s.sort_version DESC + LIMIT $1 + ` + // Query for including or excluding a list of module path patterns. + // The first argument to Sprintf will be the columns, the + // second the operator. + // If the operator is "NOT", then we will exclude the module_paths; + // if it is the empty string, we will include them. + // We need to double the percents in the LIKE expressions or Sprintf + // will try to interpret them. + pathQuery := ` + SELECT + %[1]s + FROM + module_version_states s + WHERE + (status IS NULL OR status >= 500) + AND next_processed_after < CURRENT_TIMESTAMP + AND %[2]s ( + module_path LIKE '%%/kubernetes' + OR + module_path LIKE '%%/aws-sdk-go' + ) + ORDER BY right(sort_version, 1) = '~' DESC, sort_version DESC - LIMIT $1` - return db.queryModuleVersionStates(ctx, queryFormat, limit) + LIMIT $1 + ` + // Prepend "s." to columns, because in maxQuery the bare column name is + // ambiguous. All the queries use `s` as an alias for module_version_states. + columnSlice := strings.Split(versionStateColumns, ",") + for i, c := range columnSlice { + columnSlice[i] = "s." + strings.TrimSpace(c) + } + columns := strings.Join(columnSlice, ", ") + + queries := []string{ + // latest release versions + fmt.Sprintf(maxQuery, columns, "="), + // latest non-release versions + fmt.Sprintf(maxQuery, columns, "!="), + // all other versions in order, except matching module paths + fmt.Sprintf(pathQuery, columns, "NOT"), + // all the module paths previously excluded + fmt.Sprintf(pathQuery, columns, ""), + } + + var mvs []*internal.ModuleVersionState + // Keep track of rows we've seen to dedup, because queries overlap. 
+ seen := map[[2]string]bool{} + for i, q := range queries { + err := db.db.RunQuery(ctx, q, func(rows *sql.Rows) error { + if len(mvs) >= limit { + return io.EOF // signal that we're done + } + mv, err := scanModuleVersionState(rows.Scan) + if err != nil { + return err + } + key := [2]string{mv.ModulePath, mv.Version} + if !seen[key] { + mvs = append(mvs, mv) + seen[key] = true + } + return nil + }, limit) // Do not reduce the limit on each iteration, because the queries overlap. + switch err { + case io.EOF: + log.Infof(ctx, "GetNextVersionsToFetch: finished with query #%d", i) + return mvs, nil + case nil: + continue + default: + return nil, err + } + } + return mvs, nil } // GetRecentFailedVersions returns versions that have most recently failed. |
