-
-
Notifications
You must be signed in to change notification settings - Fork 5.8k
Proof of concept: Experimental support for git commit graph files #6701
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
package commitgraph | ||
|
||
import ( | ||
"encoding/binary" | ||
"hash" | ||
"hash/fnv" | ||
|
||
"github.com/dchest/siphash" | ||
) | ||
|
||
// filter derives bloom filter bit positions from a 64-bit hash.
type filter struct {
	// m is the modulus applied to every position (the number of bits).
	// k is the number of positions produced per input.
	m, k uint32
	// h is the 64-bit hash; it is reset before every use.
	h hash.Hash64
}

// bits returns the k bit positions for data. The 64-bit digest is split into
// its low half (a) and high half (b), and position i is (a + b*i) mod m,
// computed with 32-bit wrap-around arithmetic.
func (f *filter) bits(data []byte) []uint32 {
	f.h.Reset()
	f.h.Write(data)

	// Sum yields the digest in big-endian order, so the last four bytes are
	// the low 32 bits of the 64-bit value.
	digest := binary.BigEndian.Uint64(f.h.Sum(nil))
	lo, hi := uint32(digest), uint32(digest>>32)

	positions := make([]uint32, f.k)
	for i := range positions {
		positions[i] = (lo + hi*uint32(i)) % f.m
	}
	return positions
}

// newFilter returns a filter yielding k positions in the range [0, m) backed
// by the 64-bit FNV-1 hash.
func newFilter(m, k uint32) *filter {
	f := new(filter)
	f.m = m
	f.k = k
	f.h = fnv.New64()
	return f
}
|
||
// BloomPathFilter is a probabilistic data structure that helps determining | ||
// whether a path was was changed. | ||
// | ||
// The implementation uses a standard bloom filter with n=512, m=10, k=7 | ||
// parameters using the 64-bit SipHash hash function with zero key. | ||
type BloomPathFilter struct { | ||
b []byte | ||
} | ||
|
||
// Test checks whether a path was previously added to the filter. Returns | ||
// false if the path is not present in the filter. Returns true if the path | ||
// could be present in the filter. | ||
func (f *BloomPathFilter) Test(path string) bool { | ||
d := siphash.Hash(0, 0, []byte(path)) | ||
a := uint32(d) | ||
b := uint32(d >> 32) | ||
var i uint32 | ||
for i = 0; i < 7; i++ { | ||
bit := (a + b*i) % 5120 | ||
if f.b[bit>>3]&(1<<(bit&7)) == 0 { | ||
return false | ||
} | ||
} | ||
return true | ||
} | ||
|
||
// Add path data to the filter. | ||
func (f *BloomPathFilter) Add(path string) { | ||
d := siphash.Hash(0, 0, []byte(path)) | ||
a := uint32(d) | ||
b := uint32(d >> 32) | ||
var i uint32 | ||
for i = 0; i < 7; i++ { | ||
bit := (a + b*i) % 5120 | ||
f.b[bit>>3] |= 1 << (bit & 7) | ||
} | ||
} | ||
|
||
// Data returns data bytes | ||
func (f *BloomPathFilter) Data() []byte { | ||
return f.b | ||
} | ||
|
||
// NewBloomPathFilter creates a new empty bloom filter | ||
func NewBloomPathFilter() *BloomPathFilter { | ||
f := &BloomPathFilter{make([]byte, 640)} | ||
return f | ||
} | ||
|
||
// LoadBloomPathFilter creates a bloom filter from a byte array previously | ||
// returned by Data | ||
func LoadBloomPathFilter(data []byte) *BloomPathFilter { | ||
f := &BloomPathFilter{data} | ||
return f | ||
} |
38 changes: 38 additions & 0 deletions
38
modules/commitgraph/plumbing/format/commitgraph/commitgraph.go
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
package commitgraph | ||
|
||
import ( | ||
"time" | ||
|
||
"gopkg.in/src-d/go-git.v4/plumbing" | ||
) | ||
|
||
// Node is a reduced representation of Commit as presented in the commit graph | ||
// file. It is merely useful as an optimization for walking the commit graphs. | ||
type Node struct { | ||
// TreeHash is the hash of the root tree of the commit. | ||
TreeHash plumbing.Hash | ||
// ParentIndexes are the indexes of the parent commits of the commit. | ||
ParentIndexes []int | ||
// ParentHashes are the hashes of the parent commits of the commit. | ||
ParentHashes []plumbing.Hash | ||
// Generation is the pre-computed generation number of the commit in the | ||
// commit graph, or zero if not available. | ||
Generation int | ||
// When is the timestamp of the commit. | ||
When time.Time | ||
} | ||
|
||
// Index is a representation of a commit graph that allows indexed access | ||
// to its nodes using the commit object hash. | ||
type Index interface { | ||
// GetIndexByHash gets the index in the commit graph from the commit hash, if available. | ||
GetIndexByHash(h plumbing.Hash) (int, error) | ||
// GetNodeByIndex gets the commit node from the commit graph using an index | ||
// obtained from a child node, if available. | ||
GetNodeByIndex(i int) (*Node, error) | ||
// Hashes returns all the hashes that are available in the index. | ||
Hashes() []plumbing.Hash | ||
|
||
// GetBloomFilterByIndex gets the bloom filter for files changed in the commit, if available. | ||
GetBloomFilterByIndex(i int) (*BloomPathFilter, error) | ||
} |
35 changes: 35 additions & 0 deletions
35
modules/commitgraph/plumbing/format/commitgraph/commitgraph_test.go
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
package commitgraph_test | ||
|
||
import ( | ||
"testing" | ||
|
||
"code.gitea.io/gitea/modules/commitgraph/plumbing/format/commitgraph" | ||
"golang.org/x/exp/mmap" | ||
|
||
. "gopkg.in/check.v1" | ||
"gopkg.in/src-d/go-git-fixtures.v3" | ||
"gopkg.in/src-d/go-git.v4/plumbing" | ||
) | ||
|
||
func Test(t *testing.T) { TestingT(t) } | ||
|
||
// CommitgraphSuite is the gocheck suite for the commitgraph package tests. | ||
// It embeds fixtures.Suite from go-git-fixtures. | ||
type CommitgraphSuite struct { | ||
fixtures.Suite | ||
} | ||
|
||
// Register the suite with gocheck. | ||
var _ = Suite(&CommitgraphSuite{}) | ||
|
||
func (s *CommitgraphSuite) TestDecode(c *C) { | ||
reader, err := mmap.Open("..\\..\\tests\\testgit\\objects\\info\\commit-graph") | ||
c.Assert(err, IsNil) | ||
index, err := commitgraph.OpenFileIndex(reader) | ||
c.Assert(err, IsNil) | ||
|
||
nodeIndex, err := index.GetIndexByHash(plumbing.NewHash("5aa811d3c2f6d5d6e928a4acacd15248928c26d0")) | ||
c.Assert(err, IsNil) | ||
node, err := index.GetNodeByIndex(nodeIndex) | ||
c.Assert(err, IsNil) | ||
c.Assert(len(node.ParentIndexes), Equals, 0) | ||
|
||
reader.Close() | ||
} |
197 changes: 197 additions & 0 deletions
197
modules/commitgraph/plumbing/format/commitgraph/encoder.go
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,197 @@ | ||
package commitgraph | ||
|
||
import ( | ||
"bytes" | ||
"crypto/sha1" | ||
"hash" | ||
"io" | ||
"math" | ||
|
||
"gopkg.in/src-d/go-git.v4/plumbing" | ||
"gopkg.in/src-d/go-git.v4/utils/binary" | ||
) | ||
|
||
// Encoder serializes a commit graph index to an output stream while keeping a
// running checksum of every byte written through it.
type Encoder struct {
	// Writer forwards each write to both the destination and hash.
	io.Writer
	// hash accumulates the checksum of the bytes written so far.
	hash hash.Hash
}

// NewEncoder returns a new stream encoder that writes to w. Everything
// written through the encoder is also fed into a SHA-1 hash.
func NewEncoder(w io.Writer) *Encoder {
	checksum := sha1.New()
	return &Encoder{
		Writer: io.MultiWriter(w, checksum),
		hash:   checksum,
	}
}
|
||
func (e *Encoder) Encode(idx Index) error { | ||
// Get all the hashes in the memory index | ||
hashes := idx.Hashes() | ||
|
||
// Sort the hashes and build our index | ||
plumbing.HashesSort(hashes) | ||
hashToIndex := make(map[plumbing.Hash]uint32) | ||
hashFirstToCount := make(map[byte]uint32) | ||
for i, hash := range hashes { | ||
hashToIndex[hash] = uint32(i) | ||
hashFirstToCount[hash[0]]++ | ||
} | ||
|
||
// Find out if we will need large edge table | ||
chunkCount := 3 | ||
hasLargeEdges := false | ||
for i := 0; i < len(hashes); i++ { | ||
v, _ := idx.GetNodeByIndex(i) | ||
if len(v.ParentHashes) > 2 { | ||
hasLargeEdges = true | ||
chunkCount++ | ||
break | ||
} | ||
} | ||
|
||
// Find out if the bloom filters are present | ||
hasBloomFilters := false | ||
sparseBloomFilters := false | ||
bloomFiltersCount := 0 | ||
for i := 0; i < len(hashes); i++ { | ||
_, err := idx.GetBloomFilterByIndex(i) | ||
if err == nil { | ||
bloomFiltersCount++ | ||
} | ||
} | ||
if bloomFiltersCount > 0 { | ||
hasBloomFilters = true | ||
chunkCount++ | ||
if bloomFiltersCount < (len(hashes) * 4 / 3) { | ||
sparseBloomFilters = true | ||
chunkCount++ | ||
} | ||
} | ||
|
||
var fanoutOffset = uint64(20 + (chunkCount * 12)) | ||
var oidLookupOffset = fanoutOffset + 4*256 | ||
var commitDataOffset = oidLookupOffset + uint64(len(hashes))*20 | ||
var bloomOffset = commitDataOffset + uint64(len(hashes))*36 | ||
var sparseBloomOffset = bloomOffset + uint64(bloomFiltersCount)*640 | ||
var largeEdgeListOffset = bloomOffset | ||
var largeEdges []uint32 | ||
|
||
// Write header | ||
// TODO: Error handling | ||
e.Write(commitFileSignature) | ||
e.Write([]byte{1, 1, byte(chunkCount), 0}) | ||
|
||
// Write chunk headers | ||
e.Write(oidFanoutSignature) | ||
binary.WriteUint64(e, fanoutOffset) | ||
e.Write(oidLookupSignature) | ||
binary.WriteUint64(e, oidLookupOffset) | ||
e.Write(commitDataSignature) | ||
binary.WriteUint64(e, commitDataOffset) | ||
if hasBloomFilters { | ||
e.Write(experimentalBloomSignature) | ||
binary.WriteUint64(e, bloomOffset) | ||
if sparseBloomFilters { | ||
e.Write(experimentalSparseBloomSignature) | ||
binary.WriteUint64(e, sparseBloomOffset) | ||
largeEdgeListOffset = sparseBloomOffset + uint64(len(hashes)+7)/8 | ||
} else { | ||
largeEdgeListOffset = bloomOffset + 640*uint64(len(hashes)) | ||
} | ||
} | ||
if hasLargeEdges { | ||
e.Write(largeEdgeListSignature) | ||
binary.WriteUint64(e, largeEdgeListOffset) | ||
} | ||
e.Write([]byte{0, 0, 0, 0}) | ||
binary.WriteUint64(e, uint64(0)) | ||
|
||
// Write fanout | ||
var cumulative uint32 | ||
for i := 0; i <= 0xff; i++ { | ||
if err := binary.WriteUint32(e, hashFirstToCount[byte(i)]+cumulative); err != nil { | ||
return err | ||
} | ||
cumulative += hashFirstToCount[byte(i)] | ||
} | ||
|
||
// Write OID lookup | ||
for _, hash := range hashes { | ||
if _, err := e.Write(hash[:]); err != nil { | ||
return err | ||
} | ||
} | ||
|
||
// Write commit data | ||
for _, hash := range hashes { | ||
origIndex, _ := idx.GetIndexByHash(hash) | ||
commitData, _ := idx.GetNodeByIndex(origIndex) | ||
if _, err := e.Write(commitData.TreeHash[:]); err != nil { | ||
return err | ||
} | ||
|
||
if len(commitData.ParentHashes) == 0 { | ||
binary.WriteUint32(e, parentNone) | ||
binary.WriteUint32(e, parentNone) | ||
} else if len(commitData.ParentHashes) == 1 { | ||
binary.WriteUint32(e, hashToIndex[commitData.ParentHashes[0]]) | ||
binary.WriteUint32(e, parentNone) | ||
} else if len(commitData.ParentHashes) == 2 { | ||
binary.WriteUint32(e, hashToIndex[commitData.ParentHashes[0]]) | ||
binary.WriteUint32(e, hashToIndex[commitData.ParentHashes[1]]) | ||
} else if len(commitData.ParentHashes) > 2 { | ||
binary.WriteUint32(e, hashToIndex[commitData.ParentHashes[0]]) | ||
binary.WriteUint32(e, uint32(len(largeEdges))|parentOctopusMask) | ||
for _, parentHash := range commitData.ParentHashes[1:] { | ||
largeEdges = append(largeEdges, hashToIndex[parentHash]) | ||
} | ||
largeEdges[len(largeEdges)-1] |= parentLast | ||
} | ||
|
||
unixTime := uint64(commitData.When.Unix()) | ||
unixTime |= uint64(commitData.Generation) << 34 | ||
binary.WriteUint64(e, unixTime) | ||
} | ||
|
||
// Write bloom filters (experimental) | ||
if hasBloomFilters { | ||
var sparseBloomBitset []byte | ||
|
||
if sparseBloomFilters { | ||
sparseBloomBitset = bytes.Repeat([]byte{0xff}, (len(hashes)+7)/8) | ||
} | ||
|
||
for i, hash := range hashes { | ||
origIndex, _ := idx.GetIndexByHash(hash) | ||
if bloomFilter, err := idx.GetBloomFilterByIndex(origIndex); err != nil { | ||
if !sparseBloomFilters { | ||
for i := 0; i < 80; i++ { | ||
binary.WriteUint64(e, math.MaxUint64) | ||
} | ||
} else { | ||
sparseBloomBitset[i/8] &= ^(1 << uint(i%8)) | ||
} | ||
} else { | ||
e.Write(bloomFilter.Data()) | ||
} | ||
} | ||
|
||
if sparseBloomFilters { | ||
e.Write(sparseBloomBitset) | ||
} | ||
} | ||
|
||
// Write large edges if necessary | ||
if hasLargeEdges { | ||
for _, parent := range largeEdges { | ||
binary.WriteUint32(e, parent) | ||
} | ||
} | ||
|
||
// Write checksum | ||
if _, err := e.Write(e.hash.Sum(nil)[:20]); err != nil { | ||
return err | ||
} | ||
|
||
return nil | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is inserted here only to allow people to play with the feature.
Running a health check for all the repositories will also rebuild the commit graph files (http://gitea/admin?op=9). It is entirely possible to generate the commit graph file using the command line
git commit-graph write
tool instead. The bloom filter experiment is enabled by changing BuildCommitGraph(false)
to BuildCommitGraph(true)
in the above code. It will significantly increase the size of the commit graph files and the time to build them, but in many cases it will also significantly speed up history queries on large repositories (unless I broke it :D).