diff options
| author | Julie Qiu <julie@golang.org> | 2020-04-14 17:55:11 -0400 |
|---|---|---|
| committer | Julie Qiu <julieqiu@google.com> | 2020-04-15 20:33:35 +0000 |
| commit | e90fb1c4adde446b8e963b04e01d8bc4f42a2b2b (patch) | |
| tree | 707907d1e36ad382905f40e03ff9080e39691a1d /internal/postgres/insert_module.go | |
| parent | 8c9ef27613ca4e26c101f2ac2267bed0260b6bd9 (diff) | |
| download | go-x-pkgsite-e90fb1c4adde446b8e963b04e01d8bc4f42a2b2b.tar.xz | |
internal/postgres: insert into paths, readmes, documentation, package_imports
InsertVersion now populates the following tables:
- paths
- readmes
- documentation
- package_imports
Change-Id: Ib02220101d148a261cf55bd564cb9f9a46e60544
Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/704882
Reviewed-by: Jonathan Amsterdam <jba@google.com>
Diffstat (limited to 'internal/postgres/insert_module.go')
| -rw-r--r-- | internal/postgres/insert_module.go | 565 |
1 files changed, 565 insertions, 0 deletions
diff --git a/internal/postgres/insert_module.go b/internal/postgres/insert_module.go new file mode 100644 index 00000000..1dbc854e --- /dev/null +++ b/internal/postgres/insert_module.go @@ -0,0 +1,565 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package postgres + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "sort" + "strings" + + "github.com/lib/pq" + "golang.org/x/discovery/internal" + "golang.org/x/discovery/internal/database" + "golang.org/x/discovery/internal/derrors" + "golang.org/x/discovery/internal/experiment" + "golang.org/x/discovery/internal/log" + "golang.org/x/discovery/internal/stdlib" + "golang.org/x/discovery/internal/version" + "golang.org/x/mod/module" + "golang.org/x/mod/semver" +) + +// InsertModule inserts a version into the database using +// db.saveVersion, along with a search document corresponding to each of its +// packages. +func (db *DB) InsertModule(ctx context.Context, m *internal.Module) (err error) { + defer func() { + if m == nil { + derrors.Wrap(&err, "DB.InsertModule(ctx, nil)") + } else { + derrors.Wrap(&err, "DB.InsertModule(ctx, Version(%q, %q))", m.ModulePath, m.Version) + } + }() + + if err := validateModule(m); err != nil { + return fmt.Errorf("validateVersion: %v: %w", err, derrors.InvalidArgument) + } + removeNonDistributableData(m) + + if err := db.saveModule(ctx, m); err != nil { + return err + } + + // If there is a more recent version of this module that has an alternative + // module path, then do not insert its packages into search_documents. This + // happens when a module that initially does not have a go.mod file is + // forked or fetched via some non-canonical path (such as an alternative + // capitalization), and then in a later version acquires a go.mod file. + // + // To take an actual example: github.com/sirupsen/logrus@v1.1.0 has a go.mod + // file that establishes that path as canonical. But v1.0.6 does not have a + // go.mod file. So the miscapitalized path github.com/Sirupsen/logrus at + // v1.1.0 is marked as an alternative path (code 491) by + // internal/fetch.FetchVersion and is not inserted into the DB, but at + // v1.0.6 it is considered valid, and we end up here. We still insert + // github.com/Sirupsen/logrus@v1.0.6 in the versions table and friends so + // that users who import it can find information about it, but we don't want + // it showing up in search results. + // + // Note that we end up here only if we first saw the alternative version + // (github.com/Sirupsen/logrus@v1.1.0 in the example) and then see the valid + // one. The "if code == 491" section of internal/worker.fetchAndUpdateState + // handles the case where we fetch the versions in the other order. + row := db.db.QueryRow(ctx, ` + SELECT 1 FROM module_version_states + WHERE module_path = $1 AND sort_version > $2 and status = 491`, + m.ModulePath, version.ForSorting(m.Version)) + var x int + if err := row.Scan(&x); err != sql.ErrNoRows { + log.Infof(ctx, "%s@%s: not inserting into search documents", m.ModulePath, m.Version) + return err + } + + // Insert the module's packages into search_documents. + return db.UpsertSearchDocuments(ctx, m) +} + +// saveModule inserts a Module into the database along with its packages, +// imports, and licenses. If any of these rows already exist, the module and +// corresponding will be deleted and reinserted. +// If the module is malformed then insertion will fail. +// +// A derrors.InvalidArgument error will be returned if the given module and +// licenses are invalid. +func (db *DB) saveModule(ctx context.Context, m *internal.Module) (err error) { + derrors.Wrap(&err, "DB.saveModule(ctx, Version(%q, %q))", m.ModulePath, m.Version) + if m.ReadmeContents == internal.StringFieldMissing { + // We don't expect this to ever happen here, but checking just in case. + return errors.New("saveModule: version missing ReadmeContents") + } + // Sort to ensure proper lock ordering, avoiding deadlocks. See + // b/141164828#comment8. The only deadlocks we've actually seen are on + // imports_unique, because they can occur when processing two versions of + // the same module, which happens regularly. But if we were ever to process + // the same module and version twice, we could see deadlocks in the other + // bulk inserts. + sort.Slice(m.Packages, func(i, j int) bool { + return m.Packages[i].Path < m.Packages[j].Path + }) + sort.Slice(m.Licenses, func(i, j int) bool { + return m.Licenses[i].FilePath < m.Licenses[j].FilePath + }) + for _, p := range m.Packages { + sort.Strings(p.Imports) + } + + return db.db.Transact(func(tx *database.DB) error { + // If the version exists, delete it to force an overwrite. This allows us + // to selectively repopulate data after a code change. + if err := db.DeleteModule(ctx, tx, m.ModulePath, m.Version); err != nil { + return fmt.Errorf("error deleting existing versions: %v", err) + } + moduleID, err := insertModule(ctx, tx, m) + if err != nil { + return err + } + if err := insertLicenses(ctx, tx, m, moduleID); err != nil { + return err + } + if err := insertPackages(ctx, tx, m); err != nil { + return err + } + if experiment.IsActive(ctx, internal.ExperimentInsertDirectories) { + if err := insertDirectories(ctx, tx, m, moduleID); err != nil { + return err + } + } + return nil + }) +} + +func insertModule(ctx context.Context, db *database.DB, m *internal.Module) (_ int, err error) { + defer derrors.Wrap(&err, "insertModule(ctx, %q, %q)", m.ModulePath, m.Version) + sourceInfoJSON, err := json.Marshal(m.SourceInfo) + if err != nil { + return 0, err + } + var moduleID int + err = db.QueryRow(ctx, + `INSERT INTO modules( + module_path, + version, + commit_time, + readme_file_path, + readme_contents, + sort_version, + version_type, + series_path, + source_info, + redistributable, + has_go_mod) + VALUES($1,$2,$3,$4,$5,$6,$7,$8,$9,$10, $11) + ON CONFLICT + (module_path, version) + DO UPDATE SET + readme_file_path=excluded.readme_file_path, + readme_contents=excluded.readme_contents, + source_info=excluded.source_info, + redistributable=excluded.redistributable + RETURNING id`, + m.ModulePath, + m.Version, + m.CommitTime, + m.ReadmeFilePath, + makeValidUnicode(m.ReadmeContents), + version.ForSorting(m.Version), + m.VersionType, + m.SeriesPath(), + sourceInfoJSON, + m.IsRedistributable, + m.HasGoMod, + ).Scan(&moduleID) + if err != nil { + return 0, err + } + return moduleID, nil +} + +func insertLicenses(ctx context.Context, db *database.DB, m *internal.Module, moduleID int) (err error) { + defer derrors.Wrap(&err, "insertLicenses(ctx, %q, %q)", m.ModulePath, m.Version) + var licenseValues []interface{} + for _, l := range m.Licenses { + covJSON, err := json.Marshal(l.Coverage) + if err != nil { + return fmt.Errorf("marshalling %+v: %v", l.Coverage, err) + } + licenseValues = append(licenseValues, m.ModulePath, m.Version, + l.FilePath, makeValidUnicode(string(l.Contents)), pq.Array(l.Types), covJSON, moduleID) + } + if len(licenseValues) > 0 { + licenseCols := []string{ + "module_path", + "version", + "file_path", + "contents", + "types", + "coverage", + "module_id", + } + return db.BulkInsert(ctx, "licenses", licenseCols, licenseValues, + database.OnConflictDoNothing) + } + return nil +} + +func insertPackages(ctx context.Context, db *database.DB, m *internal.Module) (err error) { + defer derrors.Wrap(&err, "insertPackages(ctx, %q, %q)", m.ModulePath, m.Version) + // We only insert into imports_unique if this is the latest + // version of the module. + isLatest, err := isLatestVersion(ctx, db, m.ModulePath, m.Version) + if err != nil { + return err + } + if isLatest { + // Remove the previous rows for this module. We'll replace them with + // new ones below. + if _, err := db.Exec(ctx, + `DELETE FROM imports_unique WHERE from_module_path = $1`, + m.ModulePath); err != nil { + return err + } + } + + var pkgValues, importValues, importUniqueValues []interface{} + for _, p := range m.Packages { + if p.DocumentationHTML == internal.StringFieldMissing { + return errors.New("saveModule: package missing DocumentationHTML") + } + var licenseTypes, licensePaths []string + for _, l := range p.Licenses { + if len(l.Types) == 0 { + // If a license file has no detected license types, we still need to + // record it as applicable to the package, because we want to fail + // closed (meaning if there is a LICENSE file containing unknown + // licenses, we assume them not to be permissive of redistribution.) + licenseTypes = append(licenseTypes, "") + licensePaths = append(licensePaths, l.FilePath) + } else { + for _, typ := range l.Types { + licenseTypes = append(licenseTypes, typ) + licensePaths = append(licensePaths, l.FilePath) + } + } + } + pkgValues = append(pkgValues, + p.Path, + p.Synopsis, + p.Name, + m.Version, + m.ModulePath, + p.V1Path, + p.IsRedistributable, + p.DocumentationHTML, + pq.Array(licenseTypes), + pq.Array(licensePaths), + p.GOOS, + p.GOARCH, + m.CommitTime, + ) + for _, i := range p.Imports { + importValues = append(importValues, p.Path, m.ModulePath, m.Version, i) + if isLatest { + importUniqueValues = append(importUniqueValues, p.Path, m.ModulePath, i) + } + } + } + if len(pkgValues) > 0 { + pkgCols := []string{ + "path", + "synopsis", + "name", + "version", + "module_path", + "v1_path", + "redistributable", + "documentation", + "license_types", + "license_paths", + "goos", + "goarch", + "commit_time", + } + if err := db.BulkInsert(ctx, "packages", pkgCols, pkgValues, database.OnConflictDoNothing); err != nil { + return err + } + } + + if len(importValues) > 0 { + importCols := []string{ + "from_path", + "from_module_path", + "from_version", + "to_path", + } + if err := db.BulkInsert(ctx, "imports", importCols, importValues, database.OnConflictDoNothing); err != nil { + return err + } + if len(importUniqueValues) > 0 { + importUniqueCols := []string{ + "from_path", + "from_module_path", + "to_path", + } + if err := db.BulkInsert(ctx, "imports_unique", importUniqueCols, importUniqueValues, database.OnConflictDoNothing); err != nil { + return err + } + } + } + return nil +} + +func insertDirectories(ctx context.Context, db *database.DB, m *internal.Module, moduleID int) (err error) { + defer derrors.Wrap(&err, "upsertDirectories(ctx, tx, %q, %q)", m.ModulePath, m.Version) + + var ( + paths []string + pathValues []interface{} + pathToReadme = map[string]*internal.Readme{} + pathToDoc = map[string]*internal.Documentation{} + pathToImports = map[string][]string{} + ) + for _, d := range m.Directories { + var licenseTypes, licensePaths []string + for _, l := range d.Licenses { + if len(l.Types) == 0 { + // If a license file has no detected license types, we still need to + // record it as applicable to the package, because we want to fail + // closed (meaning if there is a LICENSE file containing unknown + // licenses, we assume them not to be permissive of redistribution.) + licenseTypes = append(licenseTypes, "") + licensePaths = append(licensePaths, l.FilePath) + } else { + for _, typ := range l.Types { + licenseTypes = append(licenseTypes, typ) + licensePaths = append(licensePaths, l.FilePath) + } + } + } + var name string + if d.Package != nil { + name = d.Package.Name + } + pathValues = append(pathValues, + d.Path, + moduleID, + d.V1Path, + name, + pq.Array(licenseTypes), + pq.Array(licensePaths), + d.IsRedistributable, + ) + paths = append(paths, d.Path) + if d.Readme != nil { + pathToReadme[d.Path] = d.Readme + } + if d.Package != nil { + if d.Package.Documentation == nil || d.Package.Documentation.HTML == internal.StringFieldMissing { + return errors.New("saveModule: package missing DocumentationHTML") + } + pathToDoc[d.Path] = d.Package.Documentation + if len(d.Package.Imports) > 0 { + pathToImports[d.Path] = d.Package.Imports + } + } + } + if len(pathValues) > 0 { + pathCols := []string{ + "path", + "module_id", + "v1_path", + "name", + "license_types", + "license_paths", + "redistributable", + } + if err := db.BulkInsert(ctx, "paths", pathCols, pathValues, database.OnConflictDoNothing); err != nil { + return err + } + } + + // TODO: Update BulkInsert to support RETURNING, so that id and path + // are returned. + pathToID, err := getPathIDs(ctx, db, paths) + if err != nil { + return err + } + + if len(pathToReadme) > 0 { + var readmeValues []interface{} + for path, readme := range pathToReadme { + id := pathToID[path] + readmeValues = append(readmeValues, id, readme.Filepath, readme.Contents) + } + readmeCols := []string{ + "path_id", + "file_path", + "contents", + } + if err := db.BulkInsert(ctx, "readmes", readmeCols, readmeValues, database.OnConflictDoNothing); err != nil { + return err + } + } + + if len(pathToDoc) > 0 { + var docValues []interface{} + for path, doc := range pathToDoc { + id := pathToID[path] + docValues = append(docValues, id, doc.GOOS, doc.GOARCH, doc.Synopsis, doc.HTML) + } + docCols := []string{ + "path_id", + "goos", + "goarch", + "synopsis", + "html", + } + if err := db.BulkInsert(ctx, "documentation", docCols, docValues, database.OnConflictDoNothing); err != nil { + return err + } + } + + var importValues []interface{} + for pkgPath, imports := range pathToImports { + id := pathToID[pkgPath] + for _, toPath := range imports { + importValues = append(importValues, id, toPath) + } + importCols := []string{ + "path_id", + "to_path", + } + if err := db.BulkInsert(ctx, "package_imports", importCols, importValues, database.OnConflictDoNothing); err != nil { + return err + } + } + return nil +} + +func getPathIDs(ctx context.Context, db *database.DB, paths []string) (_ map[string]int, err error) { + defer derrors.Wrap(&err, "getPathIds(ctx, tx, %q)", paths) + + pathToID := map[string]int{} + collect := func(rows *sql.Rows) error { + var ( + id int + path string + ) + if err := rows.Scan(&path, &id); err != nil { + return fmt.Errorf("rows.Scan(): %v", err) + } + pathToID[path] = id + return nil + } + + if err := db.RunQuery(ctx, ` + SELECT path, id + FROM paths + WHERE path = ANY($1);`, collect, pq.Array(paths)); err != nil { + return nil, err + } + return pathToID, nil +} + +// isLatestVersion reports whether version is the latest version of the module. +func isLatestVersion(ctx context.Context, db *database.DB, modulePath, version string) (_ bool, err error) { + defer derrors.Wrap(&err, "isLatestVersion(ctx, tx, %q)", modulePath) + + row := db.QueryRow(ctx, ` + SELECT version FROM modules WHERE module_path = $1 + ORDER BY version_type = 'release' DESC, sort_version DESC + LIMIT 1`, + modulePath) + var v string + if err := row.Scan(&v); err != nil { + if err == sql.ErrNoRows { + return true, nil // It's the only version, so it's also the latest. + } + return false, err + } + return version == v, nil +} + +// validateModule checks that fields needed to insert a module into the +// database are present. Otherwise, it returns an error listing the reasons the +// module cannot be inserted. +func validateModule(m *internal.Module) error { + if m == nil { + return fmt.Errorf("nil module") + } + + var errReasons []string + if m.Version == "" { + errReasons = append(errReasons, "no specified version") + } + if m.ModulePath == "" { + errReasons = append(errReasons, "no module path") + } + if m.ModulePath != stdlib.ModulePath { + if err := module.CheckPath(m.ModulePath); err != nil { + errReasons = append(errReasons, fmt.Sprintf("invalid module path (%s)", err)) + } + if !semver.IsValid(m.Version) { + errReasons = append(errReasons, "invalid version") + } + } + if len(m.Packages) == 0 { + errReasons = append(errReasons, "module does not have any packages") + } + if m.CommitTime.IsZero() { + errReasons = append(errReasons, "empty commit time") + } + if len(errReasons) == 0 { + return nil + } + return fmt.Errorf("cannot insert module %q: %s", m.Version, strings.Join(errReasons, ", ")) +} + +// removeNonDistributableData removes any information from the version payload, +// after checking licenses. +func removeNonDistributableData(m *internal.Module) { + for _, p := range m.Packages { + if !p.IsRedistributable { + // Prune derived information that can't be stored. + p.Synopsis = "" + p.DocumentationHTML = "" + } + } + if !m.IsRedistributable { + m.ReadmeFilePath = "" + m.ReadmeContents = "" + } +} + +// DeleteModule deletes a Version from the database. +func (db *DB) DeleteModule(ctx context.Context, ddb *database.DB, modulePath, version string) (err error) { + defer derrors.Wrap(&err, "DB.DeleteModule(ctx, ddb, %q, %q)", modulePath, version) + + if ddb == nil { + ddb = db.db + } + // We only need to delete from the modules table. Thanks to ON DELETE + // CASCADE constraints, that will trigger deletions from all other tables. + const stmt = `DELETE FROM modules WHERE module_path=$1 AND version=$2` + _, err = ddb.Exec(ctx, stmt, modulePath, version) + return err +} + +// makeValidUnicode removes null runes from a string that will be saved in a +// column of type TEXT, because pq doesn't like them. It also replaces non-unicode +// characters with the Unicode replacement character, which is the behavior of +// for ... range on strings. +func makeValidUnicode(s string) string { + var b strings.Builder + for _, r := range s { + if r != 0 { + b.WriteRune(r) + } + } + return b.String() +} |
