Skip to content

Commit d03c59d

Browse files
committed
gopls/internal/lsp/source/typerefs: serialization support
This change divides the API into Encode and Decode. Encode computes the reference graph and encodes it as a serializable (gob-encoded) message. Decode parses this message and returns a data structure that can be used for the query. Decode requires a PackageIndex. This change also implements coalescing of equivalence classes, each relating a set of Decls (names of package-level declarations) to the set of Symbols (package+name of imported symbols) that they reference. Because many declarations reference the same set of symbols, this reduces the space requirements for both serialization and in-memory representation, and allows callers to exploit this reduction by avoiding repeated iteration of subgraphs. I was initially planning to record the result in a graph form, to avoid the M*N explosion when M decls each references N names, but I'm not yet convinced it's worth it: in practice, the exact same sets keep appearing, and this optimization tackles that case. There are many further optimizations we could perform, but it may be easier to prioritize and evaluate them once we have integrated typerefs into gopls. Change-Id: I0eb554730712bde2b7ffc22e5e38c22ac3a84cdd Reviewed-on: https://go-review.googlesource.com/c/tools/+/482675 Auto-Submit: Alan Donovan <[email protected]> TryBot-Result: Gopher Robot <[email protected]> Reviewed-by: Robert Findley <[email protected]> gopls-CI: kokoro <[email protected]> Run-TryBot: Alan Donovan <[email protected]>
1 parent 6492058 commit d03c59d

File tree

6 files changed

+442
-205
lines changed

6 files changed

+442
-205
lines changed

gopls/internal/lsp/filecache/filecache.go

+7-5
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,9 @@ func Get(kind string, key [32]byte) ([]byte, error) {
7272
return nil, ErrNotFound // cache entry is incomplete (or too long!)
7373
}
7474

75-
// Check for corruption and print the entire file content as
76-
// this may help us observe the pattern. See issue #59289.
75+
// Check for corruption and print the entire file content; see
76+
// issue #59289. TODO(adonovan): stop printing the entire file
77+
// once we've seen enough reports to understand the pattern.
7778
if binary.LittleEndian.Uint32(checksum) != crc32.ChecksumIEEE(value) {
7879
return nil, bug.Errorf("internal error in filecache.Get(%q, %x): invalid checksum at end of %d-byte file %s:\n%q",
7980
kind, key, len(data), name, data)
@@ -119,9 +120,10 @@ func Set(kind string, key [32]byte, value []byte) error {
119120
// assumed due to a nonatomicity problem in the file system.
120121
// Ideally the macOS kernel would be fixed, or lockedfile
121122
// would implement a workaround (since its job is to provide
122-
// reliable atomic file replacement atop kernels that don't),
123-
// but for now we add an extra integrity check: a 32-bit
124-
// checksum at the end.
123+
// the reliable mutual exclusion primitive that allows
124+
// cooperating gopls processes to implement transactional
125+
// file replacement), but for now we add an extra integrity
126+
// check: a 32-bit checksum at the end.
125127
var checksum [4]byte
126128
binary.LittleEndian.PutUint32(checksum[:], crc32.ChecksumIEEE(value))
127129

gopls/internal/lsp/source/typerefs/doc.go

+50-55
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
// Package typerefs extracts from Go syntax a graph of symbol-level
6-
// dependencies, for the purpose of precise invalidation of package data.
5+
// Package typerefs extracts symbol-level reachability information
6+
// from the syntax of a Go package.
77
//
88
// # Background
99
//
@@ -15,8 +15,8 @@
1515
// More precisely, for each package P we define the set of "reachable" packages
1616
// from P as the set of packages that may affect the (deep) export data of the
1717
// direct dependencies of P. By this definition, the complement of this set
18-
// cannot affect any information derived from type checking P (e.g.
19-
// diagnostics, cross references, or method sets). Therefore we need not
18+
// cannot affect any information derived from type checking P, such as
19+
// diagnostics, cross references, or method sets. Therefore we need not
2020
// invalidate any results for P when a package in the complement of this set
2121
// changes.
2222
//
@@ -26,11 +26,12 @@
2626
// dotted identifiers referenced in the declaration of D, that may affect
2727
// the type of D. However, these references reflect only local knowledge of the
2828
// package and its dependency metadata, and do not depend on any analysis of
29-
// the dependencies themselves.
29+
// the dependencies themselves. This allows the reference information for
30+
// a package to be cached independent of all others.
3031
//
3132
// Specifically, if a referring identifier I appears in the declaration, we
3233
// record an edge from D to each object possibly referenced by I. We search for
33-
// references within type syntax, but do not actual type-check, so we can't
34+
// references within type syntax, but do not actually type-check, so we can't
3435
// reliably determine whether an expression is a type or a term, or whether a
3536
// function is a builtin or generic. For example, the type of x in var x =
3637
// p.F(W) only depends on W if p.F is a builtin or generic function, which we
@@ -39,15 +40,16 @@
3940
//
4041
// - If I is declared in the current package, record a reference to its
4142
// declaration.
42-
// - Else, if there are any dot-imported imports in the current file and I is
43-
// exported, record a (possibly dangling) edge to the corresponding
44-
// declaration in each dot-imported package.
43+
// - Otherwise, if there are any dot imports in the current
44+
// file and I is exported, record a (possibly dangling) edge to
45+
// the corresponding declaration in each dot-imported package.
4546
//
4647
// If a dotted identifier q.I appears in the declaration, we
4748
// perform a similar operation:
49+
//
4850
// - If q is declared in the current package, we record a reference to that
4951
// object. It may be a var or const that has a field or method I.
50-
// - Else, if q is a valid import name based on imports in the current file
52+
// - Otherwise, if q is a valid import name based on imports in the current file
5153
// and the provided metadata for dependency package names, record a
5254
// reference to the object I in that package.
5355
// - Additionally, handle the case where Q is exported, and Q.I may refer to
@@ -62,56 +64,49 @@
6264
// # Graph optimizations
6365
//
6466
// The references extracted from the syntax are used to construct
65-
// edges between declNodes. Edges are of two kinds: internal
66-
// references, from one package-level declaration to another; and
67-
// external references, from a symbol in this package to a symbol
68-
// imported from a direct dependency.
67+
// edges between nodes representing declarations. Edges are of two
68+
// kinds: internal references, from one package-level declaration to
69+
// another; and external references, from a symbol in this package to
70+
// a symbol imported from a direct dependency.
6971
//
7072
// Once the symbol reference graph is constructed, we find its
71-
// strongly connected components (SCCs) using Tarjan's algorithm. A
72-
// node from each SCC is chosen arbitrarily to be its representative,
73-
// and all the edges (internal and external) of the SCC are
74-
// accumulated into the representative, thus forming the strong
75-
// component graph, which is acyclic. This property simplifies the
76-
// logic and improves the efficiency of the reachability query.
77-
//
78-
// TODO(adonovan): opt: subsequent planned optimizations include:
79-
//
80-
// - The Hash-Value Numbering optimization described in
81-
// Hardekopf and Lin; see golang.org/x/go/pointer/hvn.go for an
82-
// implementation. (Like pointer analysis, our problem is
83-
// fundamentally one of graph reachability.)
84-
//
85-
// The "pointer equivalence" (PE) portion of this algorithm uses a
86-
// hash table to create a mapping from unique sets of external
87-
// references to small integers. Each of the n external symbols
88-
// referenced by the package is assigned a integer from 1 to n;
89-
// this number stands for a singleton set. Higher numbers refer to
90-
// unions of strictly smaller sets. The PE algorithm allows us to
91-
// coalesce redundant graph nodes. For example, all functions that
92-
// ultimately reference only {fmt.Println,fmt.Sprintf} would be
93-
// marked as equivalent to each other, and to the union of
94-
// the sets of {fmt.Sprint} and {fmt.Println}.
95-
//
96-
// This reduces the worst-case size of the Refs() result. Consider
97-
// M decls that each reference type t, which references N imported
98-
// types. The source code has O(M + N) lines but the Refs result
99-
// is current O(M*N). Preserving the essential structure of the
100-
// reference graph (as a DAG of union operations) will reduce the
101-
// asymptote.
102-
//
103-
// - Serializing the SC graph obtained each package and saving it in
104-
// the file cache. Once we have a DAG of unions, we can serialize
105-
// it easily and amortize the cost of the local preprocessing.
73+
// strongly connected components (SCCs) using Tarjan's algorithm.
74+
// As we coalesce the nodes of each SCC we compute the union of
75+
// external references reached by each package-level declaration.
76+
// The final result is the mapping from each exported package-level
77+
// declaration to the set of external (imported) declarations that it
78+
// reaches.
79+
//
80+
// Because it is common for many package members to have the same
81+
// reachability, the result takes the form of a set of equivalence
82+
// classes, each mapping a set of package-level declarations to a set
83+
// of external symbols. We use a hash table to canonicalize sets so that
84+
// repeated occurrences of the same set (which are common) are only
85+
// represented once in memory or in the file system.
86+
// For example, all declarations that ultimately reference only
87+
// {fmt.Println,strings.Join} would be classed as equivalent.
88+
//
89+
// This approach was inspired by the Hash-Value Numbering (HVN)
90+
// optimization described by Hardekopf and Lin. See
91+
// golang.org/x/tools/go/pointer/hvn.go for an implementation. (Like
92+
// pointer analysis, this problem is fundamentally one of graph
93+
// reachability.) The HVN algorithm takes the compression a step
94+
// further by preserving the topology of the SCC DAG, in which edges
95+
// represent "is a superset of" constraints. Redundant edges that
96+
// don't increase the solution can be deleted. We could apply the same
97+
// technique here to further reduce the worst-case size of the result,
98+
// but the current implementation seems adequate.
10699
//
107100
// # API
108101
//
109-
// The main entry point for this analysis is the [Refs] function, which
110-
// implements the aforementioned syntactic analysis for a set of files
111-
// constituting a package.
102+
// The main entry point for this analysis is the [Encode] function,
103+
// which implements the analysis described above for one package, and
104+
// encodes the result as a binary message.
112105
//
113-
// These references use shared state to efficiently represent references, by
114-
// way of the [PackageIndex] and [PackageSet] types.
106+
// The [Decode] function decodes the message into a usable form: a set
107+
// of equivalence classes. The decoder uses a shared [PackageIndex] to
108+
// enable more compact representations of sets of packages
109+
// ([PackageSet]) during the global reachability computation.
115110
//
116111
// The [BuildPackageGraph] constructor implements a whole-graph analysis similar
117112
// to that which will be implemented by gopls, but for various reasons the
@@ -120,7 +115,7 @@
120115
// BuildPackageGraph and its test serve to verify the syntactic analysis, and
121116
// may serve as a proving ground for new optimizations of the whole-graph analysis.
122117
//
123-
// # Comparison with export data
118+
// # Export data is insufficient
124119
//
125120
// At first it may seem that the simplest way to implement this analysis would
126121
// be to consider the types.Packages of the dependencies of P, for example

gopls/internal/lsp/source/typerefs/packageset.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,11 @@ type PackageSet struct {
6868
type blockType = uint // type of each sparse vector element
6969
const blockSize = bits.UintSize
7070

71-
// New creates a new PackageSet bound to this PackageIndex instance.
71+
// NewSet creates a new PackageSet bound to this PackageIndex instance.
7272
//
7373
// PackageSets may only be combined with other PackageSets from the same
7474
// instance.
75-
func (s *PackageIndex) New() *PackageSet {
75+
func (s *PackageIndex) NewSet() *PackageSet {
7676
return &PackageSet{
7777
parent: s,
7878
sparse: make(map[int]blockType),

gopls/internal/lsp/source/typerefs/pkgrefs.go

+29-26
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ const (
1919
//
2020
// Warning: produces a lot of output! Best to run with small package queries.
2121
trace = false
22+
23+
// debug enables additional assertions.
24+
debug = false
2225
)
2326

2427
// A Package holds reference information for a single package.
@@ -135,48 +138,48 @@ func (g *PackageGraph) buildPackage(ctx context.Context, id source.PackageID) (*
135138
}
136139

137140
// Compute the symbol-level dependencies through this package.
138-
//
139-
// refs records syntactic edges between declarations in this
140-
// package and declarations in this package or another
141-
// package. See the package documentation for a detailed
142-
// description of what these edges do (and do not) represent.
143-
//
144-
// TODO(adonovan): opt: serialize and deserialize the refs
145-
// result computed above and persist it in the filecache.
146-
refs := Refs(files, id, imports)
141+
// TODO(adonovan): opt: persist this in the filecache, keyed
142+
// by hash(id, CompiledGoFiles, imports).
143+
data := Encode(files, id, imports)
147144

148145
// This point separates the local preprocessing
149146
// -- of a single package (above) from the global --
150147
// transitive reachability query (below).
151148

149+
// classes records syntactic edges between declarations in this
150+
// package and declarations in this package or another
151+
// package. See the package documentation for a detailed
152+
// description of what these edges do (and do not) represent.
153+
classes := Decode(g.pkgIndex, id, data)
154+
155+
idx := g.pkgIndex.idx(id)
156+
152157
// Now compute the transitive closure of packages reachable
153158
// from any exported symbol of this package.
154-
//
155-
// TODO(adonovan): opt: many elements of refs[name] are
156-
// identical, so this does redundant work. Choose a data type
157-
// for the result of Refs() that expresses the M:N structure
158-
// explicitly.
159-
for name, nodes := range refs {
160-
set := g.pkgIndex.New()
161-
162-
// The nodes slice is sorted by (package, name),
159+
for _, class := range classes {
160+
set := g.pkgIndex.NewSet()
161+
162+
// The Refs slice is sorted by (PackageID, name),
163163
// so we can economize by calling g.Package only
164164
// when the package id changes.
165165
depP := p
166-
for _, node := range nodes {
167-
assert(node.PkgID != id, "intra-package edge")
168-
if depP.metadata.ID != node.PkgID {
166+
for _, sym := range class.Refs {
167+
assert(sym.pkgIdx != idx, "intra-package edge")
168+
symPkgID := g.pkgIndex.id(sym.pkgIdx)
169+
if depP.metadata.ID != symPkgID {
169170
// package changed
170171
var err error
171-
depP, err = g.Package(ctx, node.PkgID)
172+
depP, err = g.Package(ctx, symPkgID)
172173
if err != nil {
173174
return nil, err
174175
}
175176
}
176-
set.add(g.pkgIndex.idx(node.PkgID))
177-
set.Union(depP.transitiveRefs[node.Name])
177+
set.add(sym.pkgIdx)
178+
set.Union(depP.transitiveRefs[sym.Name])
179+
}
180+
for _, name := range class.Decls {
181+
p.transitiveRefs[name] = set
178182
}
179-
p.transitiveRefs[name] = set
180183
}
181184

182185
// Finally compute the union of transitiveRefs
@@ -193,7 +196,7 @@ func (g *PackageGraph) buildPackage(ctx context.Context, id source.PackageID) (*
193196
// reachesByDeps computes the set of packages that are reachable through
194197
// dependencies of the package m.
195198
func (g *PackageGraph) reachesByDeps(ctx context.Context, m *source.Metadata) (*PackageSet, error) {
196-
transitive := g.pkgIndex.New()
199+
transitive := g.pkgIndex.NewSet()
197200
for _, depID := range m.DepsByPkgPath {
198201
dep, err := g.Package(ctx, depID)
199202
if err != nil {

0 commit comments

Comments
 (0)