swarm/storage: pyramid chunker re-write (#14382)

d558a595 · Zahoor Mohamed · Felix Lange · 3c865634 · d558a595 · d558a595
Commit d558a595 authored Sep 21, 2017 by Zahoor Mohamed Committed by Felix Lange Sep 21, 2017
12 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -30,3 +30,6 @@ build/_vendor/pkg
 # travis
 profile.tmp
 profile.cov
+# IdeaIDE
+.idea
--- a/swarm/network/depo.go
+++ b/swarm/network/depo.go
@@ -29,12 +29,12 @@ import (
 // Handler for storage/retrieval related protocol requests
 // implements the StorageHandler interface used by the bzz protocol
 type Depo struct {
-	hashfunc   storage.Hasher
+	hashfunc   storage.SwarmHasher
 	localStore storage.ChunkStore
 	netStore   storage.ChunkStore
 }
-func NewDepo(hash storage.Hasher, localStore, remoteStore storage.ChunkStore) *Depo {
+func NewDepo(hash storage.SwarmHasher, localStore, remoteStore storage.ChunkStore) *Depo {
 	return &Depo{
 		hashfunc:   hash,
 		localStore: localStore,

--- a/swarm/storage/chunker.go
+++ b/swarm/storage/chunker.go
@@ -20,9 +20,9 @@ import (
 	"encoding/binary"
 	"errors"
 	"fmt"
-	"hash"
 	"io"
 	"sync"
+	"time"
 )
 /*
@@ -50,14 +50,6 @@ data_{i} := size(subtree_{i}) || key_{j} || key_{j+1} .... || key_{j+n-1}
 The underlying hash function is configurable
 */
-const (
-	defaultHash = "SHA3"
-	// defaultHash = "BMTSHA3" // http://golang.org/pkg/hash/#Hash
-	// defaultHash           = "SHA256" // http://golang.org/pkg/hash/#Hash
-	defaultBranches int64 = 128
-	// hashSize     int64 = hasherfunc.New().Size() // hasher knows about its own length in bytes
-	// chunksize    int64 = branches * hashSize     // chunk is defined as this
-)
 /*
 Tree chunker is a concrete implementation of data chunking.
@@ -67,25 +59,19 @@ If all is well it is possible to implement this by simply composing readers so t
 The hashing itself does use extra copies and allocation though, since it does need it.
 */
-type ChunkerParams struct {
+var (
-	Branches int64
+	errAppendOppNotSuported = errors.New("Append operation not supported")
-	Hash     string
+	errOperationTimedOut = errors.New("operation timed out")
-}
+)
-func NewChunkerParams() *ChunkerParams {
-	return &ChunkerParams{
-		Branches: defaultBranches,
-		Hash:     defaultHash,
-	}
-}
 type TreeChunker struct {
 	branches int64
-	hashFunc Hasher
+	hashFunc SwarmHasher
 	// calculated
 	hashSize    int64 // self.hashFunc.New().Size()
 	chunkSize   int64 // hashSize* branches
-	workerCount int
+	workerCount int64 // the number of worker routines used
+	workerLock	sync.RWMutex // lock for the worker count
 }
 func NewTreeChunker(params *ChunkerParams) (self *TreeChunker) {
@@ -94,7 +80,8 @@ func NewTreeChunker(params *ChunkerParams) (self *TreeChunker) {
 	self.branches = params.Branches
 	self.hashSize = int64(self.hashFunc().Size())
 	self.chunkSize = self.hashSize * self.branches
-	self.workerCount = 1
+	self.workerCount = 0
 	return
 }
@@ -114,13 +101,31 @@ type hashJob struct {
 	parentWg *sync.WaitGroup
 }
-func (self *TreeChunker) Split(data io.Reader, size int64, chunkC chan *Chunk, swg, wwg *sync.WaitGroup) (Key, error) {
+// incrementWorkerCount adds one to workerCount while holding workerLock for
+// writing. Split and split call it right before starting a hashWorker
+// goroutine.
+func (self *TreeChunker) incrementWorkerCount() {
+	self.workerLock.Lock()
+	defer self.workerLock.Unlock()
+	self.workerCount += 1
+}
+// getWorkerCount returns the current value of workerCount, read while holding
+// workerLock for reading.
+func (self *TreeChunker) getWorkerCount() int64 {
+	self.workerLock.RLock()
+	defer self.workerLock.RUnlock()
+	return self.workerCount
+}
+// decrementWorkerCount subtracts one from workerCount while holding workerLock
+// for writing. hashWorker defers it, so it runs when a worker goroutine returns.
+func (self *TreeChunker) decrementWorkerCount() {
+	self.workerLock.Lock()
+	defer self.workerLock.Unlock()
+	self.workerCount -= 1
+}
+func (self *TreeChunker) Split(data io.Reader, size int64, chunkC chan *Chunk, swg, wwg *sync.WaitGroup) (Key, error) {
 	if self.chunkSize <= 0 {
 		panic("chunker must be initialised")
 	}
-	jobC := make(chan *hashJob, 2*processors)
+	jobC := make(chan *hashJob, 2*ChunkProcessors)
 	wg := &sync.WaitGroup{}
 	errC := make(chan error)
 	quitC := make(chan bool)
@@ -129,6 +134,8 @@ func (self *TreeChunker) Split(data io.Reader, size int64, chunkC chan *Chunk, s
 	if wwg != nil {
 		wwg.Add(1)
 	}
+	self.incrementWorkerCount()
 	go self.hashWorker(jobC, chunkC, errC, quitC, swg, wwg)
 	depth := 0
@@ -157,10 +164,15 @@ func (self *TreeChunker) Split(data io.Reader, size int64, chunkC chan *Chunk, s
 		close(errC)
 	}()
-	//TODO: add a timeout
-	if err := <-errC; err != nil {
+	defer close(quitC)
-		close(quitC)
+	select {
-		return nil, err
+	case err := <-errC:
+		if err != nil {
+			return nil, err
+		}
+	case <-time.NewTimer(splitTimeout).C:
+		return nil,errOperationTimedOut
 	}
 	return key, nil
@@ -168,6 +180,8 @@ func (self *TreeChunker) Split(data io.Reader, size int64, chunkC chan *Chunk, s
 func (self *TreeChunker) split(depth int, treeSize int64, key Key, data io.Reader, size int64, jobC chan *hashJob, chunkC chan *Chunk, errC chan error, quitC chan bool, parentWg, swg, wwg *sync.WaitGroup) {
+	// divide treeSize by branches (lowering depth each time) until size >= treeSize or depth reaches 0
 	for depth > 0 && size < treeSize {
 		treeSize /= self.branches
 		depth--
@@ -223,12 +237,15 @@ func (self *TreeChunker) split(depth int, treeSize int64, key Key, data io.Reade
 	// parentWg.Add(1)
 	// go func() {
 	childrenWg.Wait()
-	if len(jobC) > self.workerCount && self.workerCount < processors {
+	worker := self.getWorkerCount()
+	if int64(len(jobC)) > worker && worker < ChunkProcessors {
 		if wwg != nil {
 			wwg.Add(1)
 		}
-		self.workerCount++
+		self.incrementWorkerCount()
 		go self.hashWorker(jobC, chunkC, errC, quitC, swg, wwg)
 	}
 	select {
 	case jobC <- &hashJob{key, chunk, size, parentWg}:
@@ -237,6 +254,8 @@ func (self *TreeChunker) split(depth int, treeSize int64, key Key, data io.Reade
 }
 func (self *TreeChunker) hashWorker(jobC chan *hashJob, chunkC chan *Chunk, errC chan error, quitC chan bool, swg, wwg *sync.WaitGroup) {
+	defer self.decrementWorkerCount()
 	hasher := self.hashFunc()
 	if wwg != nil {
 		defer wwg.Done()
@@ -249,7 +268,6 @@ func (self *TreeChunker) hashWorker(jobC chan *hashJob, chunkC chan *Chunk, errC
 				return
 			}
 			// now we got the hashes in the chunk, then hash the chunks
-			hasher.Reset()
 			self.hashChunk(hasher, job, chunkC, swg)
 		case <-quitC:
 			return
@@ -260,9 +278,11 @@ func (self *TreeChunker) hashWorker(jobC chan *hashJob, chunkC chan *Chunk, errC
 // The treeChunkers own Hash hashes together
 // - the size (of the subtree encoded in the Chunk)
 // - the Chunk, ie. the contents read from the input reader
-func (self *TreeChunker) hashChunk(hasher hash.Hash, job *hashJob, chunkC chan *Chunk, swg *sync.WaitGroup) {
+func (self *TreeChunker) hashChunk(hasher SwarmHash, job *hashJob, chunkC chan *Chunk, swg *sync.WaitGroup) {
-	hasher.Write(job.chunk)
+	hasher.ResetWithLength(job.chunk[:8]) // 8 bytes of length
+	hasher.Write(job.chunk[8:])           // minus 8 []byte length
 	h := hasher.Sum(nil)
 	newChunk := &Chunk{
 		Key:   h,
 		SData: job.chunk,
@@ -285,6 +305,10 @@ func (self *TreeChunker) hashChunk(hasher hash.Hash, job *hashJob, chunkC chan *
 	}
 }
+// Append is not supported by TreeChunker: it ignores all of its arguments and
+// always returns a nil key together with errAppendOppNotSuported.
+func (self *TreeChunker) Append(key Key, data io.Reader, chunkC chan *Chunk, swg, wwg *sync.WaitGroup) (Key, error) {
+	return nil, errAppendOppNotSuported
+}
 // LazyChunkReader implements LazySectionReader
 type LazyChunkReader struct {
 	key       Key         // root key
@@ -298,7 +322,6 @@ type LazyChunkReader struct {
 // implements the Joiner interface
 func (self *TreeChunker) Join(key Key, chunkC chan *Chunk) LazySectionReader {
 	return &LazyChunkReader{
 		key:       key,
 		chunkC:    chunkC,

--- a/swarm/storage/chunker_test.go
+++ b/swarm/storage/chunker_test.go
--- a/swarm/storage/common_test.go
+++ b/swarm/storage/common_test.go
@@ -76,7 +76,7 @@ func testStore(m ChunkStore, l int64, branches int64, t *testing.T) {
 	}()
 	chunker := NewTreeChunker(&ChunkerParams{
 		Branches: branches,
-		Hash:     defaultHash,
+		Hash:     SHA3Hash,
 	})
 	swg := &sync.WaitGroup{}
 	key, _ := chunker.Split(rand.Reader, l, chunkC, swg, nil)

--- a/swarm/storage/dbstore.go
+++ b/swarm/storage/dbstore.go
@@ -72,12 +72,12 @@ type DbStore struct {
 	gcPos, gcStartPos []byte
 	gcArray           []*gcItem
-	hashfunc Hasher
+	hashfunc SwarmHasher
 	lock sync.Mutex
 }
-func NewDbStore(path string, hash Hasher, capacity uint64, radius int) (s *DbStore, err error) {
+func NewDbStore(path string, hash SwarmHasher, capacity uint64, radius int) (s *DbStore, err error) {
 	s = new(DbStore)
 	s.hashfunc = hash

--- a/swarm/storage/dbstore_test.go
+++ b/swarm/storage/dbstore_test.go
@@ -29,7 +29,7 @@ func initDbStore(t *testing.T) *DbStore {
 	if err != nil {
 		t.Fatal(err)
 	}
-	m, err := NewDbStore(dir, MakeHashFunc(defaultHash), defaultDbCapacity, defaultRadius)
+	m, err := NewDbStore(dir, MakeHashFunc(SHA3Hash), defaultDbCapacity, defaultRadius)
 	if err != nil {
 		t.Fatal("can't create store:", err)
 	}

--- a/swarm/storage/localstore.go
+++ b/swarm/storage/localstore.go
@@ -28,7 +28,7 @@ type LocalStore struct {
 }
 // This constructor uses MemStore and DbStore as components
-func NewLocalStore(hash Hasher, params *StoreParams) (*LocalStore, error) {
+func NewLocalStore(hash SwarmHasher, params *StoreParams) (*LocalStore, error) {
 	dbStore, err := NewDbStore(params.ChunkDbPath, hash, params.DbCapacity, params.Radius)
 	if err != nil {
 		return nil, err

--- a/swarm/storage/netstore.go
+++ b/swarm/storage/netstore.go
@@ -36,7 +36,7 @@ NetStore falls back to a backend (CloudStorage interface)
 implemented by bzz/network/forwarder. forwarder or IPFS or IPΞS
 */
 type NetStore struct {
-	hashfunc   Hasher
+	hashfunc   SwarmHasher
 	localStore *LocalStore
 	cloud      CloudStore
 }
@@ -69,7 +69,7 @@ func NewStoreParams(path string) (self *StoreParams) {
 // netstore contructor, takes path argument that is used to initialise dbStore,
 // the persistent (disk) storage component of LocalStore
 // the second argument is the hive, the connection/logistics manager for the node
-func NewNetStore(hash Hasher, lstore *LocalStore, cloud CloudStore, params *StoreParams) *NetStore {
+func NewNetStore(hash SwarmHasher, lstore *LocalStore, cloud CloudStore, params *StoreParams) *NetStore {
 	return &NetStore{
 		hashfunc:   hash,
 		localStore: lstore,

--- a/swarm/storage/pyramid.go
+++ b/swarm/storage/pyramid.go
--- a/swarm/storage/swarmhasher.go
+++ b/swarm/storage/swarmhasher.go
+// Copyright 2017 The go-ethereum Authors
+// This file is part of the go-ethereum library.
+//
+// The go-ethereum library is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The go-ethereum library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
+package storage
+import (
+	"hash"
+)
+// Hash function names. The values match the string cases that MakeHashFunc
+// switches on.
+const (
+	BMTHash  = "BMT"
+	SHA3Hash = "SHA3" // http://golang.org/pkg/hash/#Hash
+)
+// SwarmHash is a hash.Hash that can additionally be reset with a length
+// prefix. hashChunk calls ResetWithLength with the first 8 bytes of a chunk
+// and then writes the remainder of the chunk.
+type SwarmHash interface {
+	hash.Hash
+	ResetWithLength([]byte)
+}
+// HashWithLength embeds a hash.Hash. Together with its ResetWithLength method,
+// *HashWithLength is what MakeHashFunc returns as a SwarmHash for the "SHA256"
+// and "SHA3" cases.
+type HashWithLength struct {
+	hash.Hash
+}
+// ResetWithLength resets the embedded hash and then writes length into it, so
+// the length bytes are the first input of the next digest. The return values
+// of Write are discarded.
+func (self *HashWithLength) ResetWithLength(length []byte) {
+	self.Reset()
+	self.Write(length)
+}
--- a/swarm/storage/types.go
+++ b/swarm/storage/types.go
@@ -24,12 +24,13 @@ import (
 	"io"
 	"sync"
-	// "github.com/ethereum/go-ethereum/bmt"
+	"github.com/ethereum/go-ethereum/bmt"
 	"github.com/ethereum/go-ethereum/common"
 	"github.com/ethereum/go-ethereum/crypto/sha3"
 )
 type Hasher func() hash.Hash
+type SwarmHasher func() SwarmHash
 // Peer is the recorded as Source on the chunk
 // should probably not be here? but network should wrap chunk object
@@ -78,12 +79,18 @@ func IsZeroKey(key Key) bool {
 var ZeroKey = Key(common.Hash{}.Bytes())
-func MakeHashFunc(hash string) Hasher {
+// MakeHashFunc returns a constructor for the hash function named by hash.
+// "SHA256" and "SHA3" wrap the underlying hash in HashWithLength; "BMT" returns
+// a bmt hasher. Any other name yields nil.
+func MakeHashFunc(hash string) SwarmHasher {
 	switch hash {
 	case "SHA256":
-		return crypto.SHA256.New
+		return func() SwarmHash { return &HashWithLength{crypto.SHA256.New()} }
 	case "SHA3":
-		return sha3.NewKeccak256
+		return func() SwarmHash { return &HashWithLength{sha3.NewKeccak256()} }
+	case "BMT":
+		return func() SwarmHash {
+			hasher := sha3.NewKeccak256
+			// NOTE(review): a new tree pool is built on every call of this constructor — confirm this is intended.
+			pool := bmt.NewTreePool(hasher, bmt.DefaultSegmentCount, bmt.DefaultPoolSize)
+			return bmt.New(pool)
+		}
 	}
 	return nil
 }
@@ -192,6 +199,13 @@ type Splitter interface {
 	   A closed error signals process completion at which point the key can be considered final if there were no errors.
 	*/
 	Split(io.Reader, int64, chan *Chunk, *sync.WaitGroup, *sync.WaitGroup) (Key, error)
+	/* This is the first step in making files mutable (not chunks).
+	   Append allows adding more data chunks to the end of an already existing file.
+	   The key for the root chunk is supplied to load the respective tree.
+	   The rest of the parameters behave like those of Split.
+	*/
+	Append(Key, io.Reader, chan *Chunk, *sync.WaitGroup, *sync.WaitGroup) (Key, error)
 }
 type Joiner interface {