gopls/internal/lsp/source/typerefs: serialization support

adonovan · adonovan · commit d03c59db96d2 · 2023-04-10T22:26:48.000Z
This change divides the API into Encode and Decode. Encode computes the reference graph and encodes it as a serializable (gob-encoded) message. Decode parses this message and returns a data structure that can be used for the query. Decode requires a PackageIndex. This change also implements coalescing of equivalence classes, each relating a set of Decls (names of package-level declarations) to the set of Symbols (package+name of imported symbols) that they reference. Because many declarations reference the same set of symbols, this reduces the space requirements for both serialization and in-memory representation, and allows callers to exploit this reduction by avoiding repeated iteration of subgraphs. I was initially planning to record the result in a graph form, to avoid the M*N explosion when M decls each references N names, but I'm not yet convinced it's worth it: in practice, the exact same sets keep appearing, and this optimization tackles that case. There are many further optimizations we could perform, but it may be easier to prioritize and evaluate them once we have integrated typerefs into gopls. Change-Id: I0eb554730712bde2b7ffc22e5e38c22ac3a84cdd Reviewed-on: https://go-review.googlesource.com/c/tools/+/482675 Auto-Submit: Alan Donovan <adonovan@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Robert Findley <rfindley@google.com> gopls-CI: kokoro <noreply+kokoro@google.com> Run-TryBot: Alan Donovan <adonovan@google.com>
diff --git a/gopls/internal/lsp/filecache/filecache.go b/gopls/internal/lsp/filecache/filecache.go
@@ -72,8 +72,9 @@ func Get(kind string, key [32]byte) ([]byte, error) {
 		return nil, ErrNotFound // cache entry is incomplete (or too long!)
 	}
 
-	// Check for corruption and print the entire file content as
-	// this may help us observe the pattern. See issue #59289.
+	// Check for corruption and print the entire file content; see
+	// issue #59289. TODO(adonovan): stop printing the entire file
+	// once we've seen enough reports to understand the pattern.
 	if binary.LittleEndian.Uint32(checksum) != crc32.ChecksumIEEE(value) {
 		return nil, bug.Errorf("internal error in filecache.Get(%q, %x): invalid checksum at end of %d-byte file %s:\n%q",
 			kind, key, len(data), name, data)
@@ -119,9 +120,10 @@ func Set(kind string, key [32]byte, value []byte) error {
 	// assumed due to a nonatomicity problem in the file system.
 	// Ideally the macOS kernel would be fixed, or lockedfile
 	// would implement a workaround (since its job is to provide
-	// reliable atomic file replacement atop kernels that don't),
-	// but for now we add an extra integrity check: a 32-bit
-	// checksum at the end.
+	// the reliable mutual exclusion primitive that allows
+	// cooperating gopls processes to implement transactional
+	// file replacement), but for now we add an extra integrity
+	// check: a 32-bit checksum at the end.
 	var checksum [4]byte
 	binary.LittleEndian.PutUint32(checksum[:], crc32.ChecksumIEEE(value))
 
diff --git a/gopls/internal/lsp/source/typerefs/doc.go b/gopls/internal/lsp/source/typerefs/doc.go
@@ -2,8 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Package typerefs extracts from Go syntax a graph of symbol-level
-// dependencies, for the purpose of precise invalidation of package data.
+// Package typerefs extracts symbol-level reachability information
+// from the syntax of a Go package.
 //
 // # Background
 //
@@ -15,8 +15,8 @@
 // More precisely, for each package P we define the set of "reachable" packages
 // from P as the set of packages that may affect the (deep) export data of the
 // direct dependencies of P. By this definition, the complement of this set
-// cannot affect any information derived from type checking P (e.g.
-// diagnostics, cross references, or method sets). Therefore we need not
+// cannot affect any information derived from type checking P, such as
+// diagnostics, cross references, or method sets. Therefore we need not
 // invalidate any results for P when a package in the complement of this set
 // changes.
 //
@@ -26,11 +26,12 @@
 // dotted identifiers referenced in the declaration of D, that may affect
 // the type of D. However, these references reflect only local knowledge of the
 // package and its dependency metadata, and do not depend on any analysis of
-// the dependencies themselves.
+// the dependencies themselves. This allows the reference information for
+// a package to be cached independent of all others.
 //
 // Specifically, if a referring identifier I appears in the declaration, we
 // record an edge from D to each object possibly referenced by I. We search for
-// references within type syntax, but do not actual type-check, so we can't
+// references within type syntax, but do not actually type-check, so we can't
 // reliably determine whether an expression is a type or a term, or whether a
 // function is a builtin or generic. For example, the type of x in var x =
 // p.F(W) only depends on W if p.F is a builtin or generic function, which we
@@ -39,15 +40,16 @@
 //
 //   - If I is declared in the current package, record a reference to its
 //     declaration.
-//   - Else, if there are any dot-imported imports in the current file and I is
-//     exported, record a (possibly dangling) edge to the corresponding
-//     declaration in each dot-imported package.
+//   - Otherwise, if there are any dot imports in the current
+//     file and I is exported, record a (possibly dangling) edge to
+//     the corresponding declaration in each dot-imported package.
 //
 // If a dotted identifier q.I appears in the declaration, we
 // perform a similar operation:
+//
 //   - If q is declared in the current package, we record a reference to that
 //     object. It may be a var or const that has a field or method I.
-//   - Else, if q is a valid import name based on imports in the current file
+//   - Otherwise, if q is a valid import name based on imports in the current file
 //     and the provided metadata for dependency package names, record a
 //     reference to the object I in that package.
 //   - Additionally, handle the case where Q is exported, and Q.I may refer to
@@ -62,56 +64,49 @@
 // # Graph optimizations
 //
 // The references extracted from the syntax are used to construct
-// edges between declNodes. Edges are of two kinds: internal
-// references, from one package-level declaration to another; and
-// external references, from a symbol in this package to a symbol
-// imported from a direct dependency.
+// edges between nodes representing declarations. Edges are of two
+// kinds: internal references, from one package-level declaration to
+// another; and external references, from a symbol in this package to
+// a symbol imported from a direct dependency.
 //
 // Once the symbol reference graph is constructed, we find its
-// strongly connected components (SCCs) using Tarjan's algorithm. A
-// node from each SCC is chosen arbitrarily to be its representative,
-// and all the edges (internal and external) of the SCC are
-// accumulated into the representative, thus forming the strong
-// component graph, which is acyclic. This property simplifies the
-// logic and improves the efficiency of the reachability query.
-//
-// TODO(adonovan): opt: subsequent planned optimizations include:
-//
-//   - The Hash-Value Numbering optimization described in
-//     Hardekopf and Lin; see golang.org/x/go/pointer/hvn.go for an
-//     implementation. (Like pointer analysis, our problem is
-//     fundamentally one of graph reachability.)
-//
-//     The "pointer equivalence" (PE) portion of this algorithm uses a
-//     hash table to create a mapping from unique sets of external
-//     references to small integers. Each of the n external symbols
-//     referenced by the package is assigned a integer from 1 to n;
-//     this number stands for a singleton set. Higher numbers refer to
-//     unions of strictly smaller sets. The PE algorithm allows us to
-//     coalesce redundant graph nodes. For example, all functions that
-//     ultimately reference only {fmt.Println,fmt.Sprintf} would be
-//     marked as equivalent to each other, and to the union of
-//     the sets of {fmt.Sprint} and {fmt.Println}.
-//
-//     This reduces the worst-case size of the Refs() result. Consider
-//     M decls that each reference type t, which references N imported
-//     types. The source code has O(M + N) lines but the Refs result
-//     is current O(M*N). Preserving the essential structure of the
-//     reference graph (as a DAG of union operations) will reduce the
-//     asymptote.
-//
-//   - Serializing the SC graph obtained each package and saving it in
-//     the file cache. Once we have a DAG of unions, we can serialize
-//     it easily and amortize the cost of the local preprocessing.
+// strongly connected components (SCCs) using Tarjan's algorithm.
+// As we coalesce the nodes of each SCC we compute the union of
+// external references reached by each package-level declaration.
+// The final result is the mapping from each exported package-level
+// declaration to the set of external (imported) declarations that it
+// reaches.
+//
+// Because it is common for many package members to have the same
+// reachability, the result takes the form of a set of equivalence
+// classes, each mapping a set of package-level declarations to a set
+// of external symbols. We use a hash table to canonicalize sets so that
+// repeated occurrences of the same set (which are common) are only
+// represented once in memory or in the file system.
+// For example, all declarations that ultimately reference only
+// {fmt.Println,strings.Join} would be classed as equivalent.
+//
+// This approach was inspired by the Hash-Value Numbering (HVN)
+// optimization described by Hardekopf and Lin. See
+// golang.org/x/tools/go/pointer/hvn.go for an implementation. (Like
+// pointer analysis, this problem is fundamentally one of graph
+// reachability.) The HVN algorithm takes the compression a step
+// further by preserving the topology of the SCC DAG, in which edges
+// represent "is a superset of" constraints. Redundant edges that
+// don't increase the solution can be deleted. We could apply the same
+// technique here to further reduce the worst-case size of the result,
+// but the current implementation seems adequate.
 //
 // # API
 //
-// The main entry point for this analysis is the [Refs] function, which
-// implements the aforementioned syntactic analysis for a set of files
-// constituting a package.
+// The main entry point for this analysis is the [Encode] function,
+// which implements the analysis described above for one package, and
+// encodes the result as a binary message.
 //
-// These references use shared state to efficiently represent references, by
-// way of the [PackageIndex] and [PackageSet] types.
+// The [Decode] function decodes the message into a usable form: a set
+// of equivalence classes. The decoder uses a shared [PackageIndex] to
+// enable more compact representations of sets of packages
+// ([PackageSet]) during the global reachability computation.
 //
 // The [BuildPackageGraph] constructor implements a whole-graph analysis similar
 // to that which will be implemented by gopls, but for various reasons the
@@ -120,7 +115,7 @@
 // BuildPackageGraph and its test serve to verify the syntactic analysis, and
 // may serve as a proving ground for new optimizations of the whole-graph analysis.
 //
-// # Comparison with export data
+// # Export data is insufficient
 //
 // At first it may seem that the simplest way to implement this analysis would
 // be to consider the types.Packages of the dependencies of P, for example
diff --git a/gopls/internal/lsp/source/typerefs/packageset.go b/gopls/internal/lsp/source/typerefs/packageset.go
@@ -68,11 +68,11 @@ type PackageSet struct {
 type blockType = uint // type of each sparse vector element
 const blockSize = bits.UintSize
 
-// New creates a new PackageSet bound to this PackageIndex instance.
+// NewSet creates a new PackageSet bound to this PackageIndex instance.
 //
 // PackageSets may only be combined with other PackageSets from the same
 // instance.
-func (s *PackageIndex) New() *PackageSet {
+func (s *PackageIndex) NewSet() *PackageSet {
 	return &PackageSet{
 		parent: s,
 		sparse: make(map[int]blockType),
diff --git a/gopls/internal/lsp/source/typerefs/pkgrefs.go b/gopls/internal/lsp/source/typerefs/pkgrefs.go
@@ -19,6 +19,9 @@ const (
 	//
 	// Warning: produces a lot of output! Best to run with small package queries.
 	trace = false
+
+	// debug enables additional assertions.
+	debug = false
 )
 
 // A Package holds reference information for a single package.
@@ -135,48 +138,48 @@ func (g *PackageGraph) buildPackage(ctx context.Context, id source.PackageID) (*
 	}
 
 	// Compute the symbol-level dependencies through this package.
-	//
-	// refs records syntactic edges between declarations in this
-	// package and declarations in this package or another
-	// package. See the package documentation for a detailed
-	// description of what these edges do (and do not) represent.
-	//
-	// TODO(adonovan): opt: serialize and deserialize the refs
-	// result computed above and persist it in the filecache.
-	refs := Refs(files, id, imports)
+	// TODO(adonovan): opt: persist this in the filecache, keyed
+	// by hash(id, CompiledGoFiles, imports).
+	data := Encode(files, id, imports)
 
 	//      This point separates the local preprocessing
 	//  --  of a single package (above) from the global   --
 	//      transitive reachability query (below).
 
+	// classes records syntactic edges between declarations in this
+	// package and declarations in this package or another
+	// package. See the package documentation for a detailed
+	// description of what these edges do (and do not) represent.
+	classes := Decode(g.pkgIndex, id, data)
+
+	idx := g.pkgIndex.idx(id)
+
 	// Now compute the transitive closure of packages reachable
 	// from any exported symbol of this package.
-	//
-	// TODO(adonovan): opt: many elements of refs[name] are
-	// identical, so this does redundant work. Choose a data type
-	// for the result of Refs() that expresses the M:N structure
-	// explicitly.
-	for name, nodes := range refs {
-		set := g.pkgIndex.New()
-
-		// The nodes slice is sorted by (package, name),
+	for _, class := range classes {
+		set := g.pkgIndex.NewSet()
+
+		// The Refs slice is sorted by (PackageID, name),
 		// so we can economize by calling g.Package only
 		// when the package id changes.
 		depP := p
-		for _, node := range nodes {
-			assert(node.PkgID != id, "intra-package edge")
-			if depP.metadata.ID != node.PkgID {
+		for _, sym := range class.Refs {
+			assert(sym.pkgIdx != idx, "intra-package edge")
+			symPkgID := g.pkgIndex.id(sym.pkgIdx)
+			if depP.metadata.ID != symPkgID {
 				// package changed
 				var err error
-				depP, err = g.Package(ctx, node.PkgID)
+				depP, err = g.Package(ctx, symPkgID)
 				if err != nil {
 					return nil, err
 				}
 			}
-			set.add(g.pkgIndex.idx(node.PkgID))
-			set.Union(depP.transitiveRefs[node.Name])
+			set.add(sym.pkgIdx)
+			set.Union(depP.transitiveRefs[sym.Name])
+		}
+		for _, name := range class.Decls {
+			p.transitiveRefs[name] = set
 		}
-		p.transitiveRefs[name] = set
 	}
 
 	// Finally compute the union of transitiveRefs
@@ -193,7 +196,7 @@ func (g *PackageGraph) buildPackage(ctx context.Context, id source.PackageID) (*
 // reachesByDeps computes the set of packages that are reachable through
 // dependencies of the package m.
 func (g *PackageGraph) reachesByDeps(ctx context.Context, m *source.Metadata) (*PackageSet, error) {
-	transitive := g.pkgIndex.New()
+	transitive := g.pkgIndex.NewSet()
 	for _, depID := range m.DepsByPkgPath {
 		dep, err := g.Package(ctx, depID)
 		if err != nil {
diff --git a/gopls/internal/lsp/source/typerefs/refs.go b/gopls/internal/lsp/source/typerefs/refs.go
diff --git a/gopls/internal/lsp/source/typerefs/refs_test.go b/gopls/internal/lsp/source/typerefs/refs_test.go