Merge pull request #592 from fjl/disco-ping-pong

Discovery bonding protocol

Merge pull request #592 from fjl/disco-ping-pong
Discovery bonding protocol
fd171eff · Jeffrey Wilcke · 101ea1a1 · 76218959 · fd171eff · fd171eff
Commit fd171eff authored Apr 01, 2015 by Jeffrey Wilcke
6 changed files
--- a/eth/backend.go
+++ b/eth/backend.go
@@ -32,8 +32,8 @@ var (
 	defaultBootNodes = []*discover.Node{
 		// ETH/DEV cmd/bootnode
 		discover.MustParseNode("enode://09fbeec0d047e9a37e63f60f8618aa9df0e49271f3fadb2c070dc09e2099b95827b63a8b837c6fd01d0802d457dd83e3bd48bd3e6509f8209ed90dabbc30e3d3@52.16.188.185:30303"),
-		// ETH/DEV cpp-ethereum (poc-8.ethdev.com)
-		discover.MustParseNode("enode://4a44599974518ea5b0f14c31c4463692ac0329cb84851f3435e6d1b18ee4eae4aa495f846a0fa1219bd58035671881d44423876e57db2abd57254d0197da0ebe@5.1.83.226:30303"),
+		// ETH/DEV cpp-ethereum (poc-9.ethdev.com)
+		discover.MustParseNode("enode://487611428e6c99a11a9795a6abe7b529e81315ca6aad66e2a2fc76e3adf263faba0d35466c2f8f68d561dbefa8878d4df5f1f2ddb1fbeab7f42ffb8cd328bd4a@5.1.83.226:30303"),
 	}
 )


--- a/p2p/discover/node.go
+++ b/p2p/discover/node.go
@@ -13,6 +13,8 @@ import (
 	"net/url"
 	"strconv"
 	"strings"
+	"sync"
+	"sync/atomic"
 	"time"

 	"github.com/ethereum/go-ethereum/crypto"
@@ -30,7 +32,8 @@ type Node struct {
 	DiscPort int // UDP listening port for discovery protocol
 	TCPPort  int // TCP listening port for RLPx

-	active time.Time
+	// this must be set/read using atomic load and store.
+	activeStamp int64
 }

 func newNode(id NodeID, addr *net.UDPAddr) *Node {
@@ -39,7 +42,6 @@ func newNode(id NodeID, addr *net.UDPAddr) *Node {
 		IP:       addr.IP,
 		DiscPort: addr.Port,
 		TCPPort:  addr.Port,
-		active:   time.Now(),
 	}
 }

@@ -48,6 +50,20 @@ func (n *Node) isValid() bool {
 	return !n.IP.IsMulticast() && !n.IP.IsUnspecified() && n.TCPPort != 0 && n.DiscPort != 0
 }

+func (n *Node) bumpActive() {
+	stamp := time.Now().Unix()
+	atomic.StoreInt64(&n.activeStamp, stamp)
+}
+
+func (n *Node) active() time.Time {
+	stamp := atomic.LoadInt64(&n.activeStamp)
+	return time.Unix(stamp, 0)
+}
+
+func (n *Node) addr() *net.UDPAddr {
+	return &net.UDPAddr{IP: n.IP, Port: n.DiscPort}
+}
+
 // The string representation of a Node is a URL.
 // Please see ParseNode for a description of the format.
 func (n *Node) String() string {
@@ -304,3 +320,26 @@ func randomID(a NodeID, n int) (b NodeID) {
 	}
 	return b
 }
+
+// nodeDB stores all nodes we know about.
+type nodeDB struct {
+	mu   sync.RWMutex
+	byID map[NodeID]*Node
+}
+
+func (db *nodeDB) get(id NodeID) *Node {
+	db.mu.RLock()
+	defer db.mu.RUnlock()
+	return db.byID[id]
+}
+
+func (db *nodeDB) add(id NodeID, addr *net.UDPAddr, tcpPort uint16) *Node {
+	db.mu.Lock()
+	defer db.mu.Unlock()
+	if db.byID == nil {
+		db.byID = make(map[NodeID]*Node)
+	}
+	n := &Node{ID: id, IP: addr.IP, DiscPort: addr.Port, TCPPort: int(tcpPort)}
+	db.byID[n.ID] = n
+	return n
+}
--- a/p2p/discover/table.go
+++ b/p2p/discover/table.go
@@ -14,9 +14,10 @@ import (
 )

 const (
-	alpha      = 3              // Kademlia concurrency factor
-	bucketSize = 16             // Kademlia bucket size
-	nBuckets   = nodeIDBits + 1 // Number of buckets
+	alpha               = 3              // Kademlia concurrency factor
+	bucketSize          = 16             // Kademlia bucket size
+	nBuckets            = nodeIDBits + 1 // Number of buckets
+	maxBondingPingPongs = 10
 )

 type Table struct {
@@ -24,27 +25,50 @@ type Table struct {
 	buckets [nBuckets]*bucket // index of known nodes by distance
 	nursery []*Node           // bootstrap nodes

+	bondmu    sync.Mutex
+	bonding   map[NodeID]*bondproc
+	bondslots chan struct{} // limits total number of active bonding processes
+
 	net  transport
 	self *Node // metadata of the local node
+	db   *nodeDB
+}
+
+type bondproc struct {
+	err  error
+	n    *Node
+	done chan struct{}
 }

 // transport is implemented by the UDP transport.
 // it is an interface so we can test without opening lots of UDP
 // sockets and without generating a private key.
 type transport interface {
-	ping(*Node) error
-	findnode(e *Node, target NodeID) ([]*Node, error)
+	ping(NodeID, *net.UDPAddr) error
+	waitping(NodeID) error
+	findnode(toid NodeID, addr *net.UDPAddr, target NodeID) ([]*Node, error)
 	close()
 }

 // bucket contains nodes, ordered by their last activity.
+// the entry that was most recently active is the first element
+// in entries; the least recently active entry is the last one.
 type bucket struct {
 	lastLookup time.Time
 	entries    []*Node
 }

 func newTable(t transport, ourID NodeID, ourAddr *net.UDPAddr) *Table {
-	tab := &Table{net: t, self: newNode(ourID, ourAddr)}
+	tab := &Table{
+		net:       t,
+		db:        new(nodeDB),
+		self:      newNode(ourID, ourAddr),
+		bonding:   make(map[NodeID]*bondproc),
+		bondslots: make(chan struct{}, maxBondingPingPongs),
+	}
+	for i := 0; i < cap(tab.bondslots); i++ {
+		tab.bondslots <- struct{}{}
+	}
 	for i := range tab.buckets {
 		tab.buckets[i] = new(bucket)
 	}
@@ -107,8 +131,8 @@ func (tab *Table) Lookup(target NodeID) []*Node {
 				asked[n.ID] = true
 				pendingQueries++
 				go func() {
-					result, _ := tab.net.findnode(n, target)
-					reply <- result
+					r, _ := tab.net.findnode(n.ID, n.addr(), target)
+					reply <- tab.bondall(r)
 				}()
 			}
 		}
@@ -116,13 +140,11 @@ func (tab *Table) Lookup(target NodeID) []*Node {
 			// we have asked all closest nodes, stop the search
 			break
 		}
-
 		// wait for the next reply
 		for _, n := range <-reply {
-			cn := n
-			if !seen[n.ID] {
+			if n != nil && !seen[n.ID] {
 				seen[n.ID] = true
-				result.push(cn, bucketSize)
+				result.push(n, bucketSize)
 			}
 		}
 		pendingQueries--
@@ -145,8 +167,9 @@ func (tab *Table) refresh() {
 	result := tab.Lookup(randomID(tab.self.ID, ld))
 	if len(result) == 0 {
 		// bootstrap the table with a self lookup
+		all := tab.bondall(tab.nursery)
 		tab.mutex.Lock()
-		tab.add(tab.nursery)
+		tab.add(all)
 		tab.mutex.Unlock()
 		tab.Lookup(tab.self.ID)
 		// TODO: the Kademlia paper says that we're supposed to perform
@@ -176,45 +199,105 @@ func (tab *Table) len() (n int) {
 	return n
 }

-// bumpOrAdd updates the activity timestamp for the given node and
-// attempts to insert the node into a bucket. The returned Node might
-// not be part of the table. The caller must hold tab.mutex.
-func (tab *Table) bumpOrAdd(node NodeID, from *net.UDPAddr) (n *Node) {
-	b := tab.buckets[logdist(tab.self.ID, node)]
-	if n = b.bump(node); n == nil {
-		n = newNode(node, from)
-		if len(b.entries) == bucketSize {
-			tab.pingReplace(n, b)
-		} else {
-			b.entries = append(b.entries, n)
+// bondall bonds with all given nodes concurrently and returns
+// those nodes for which bonding has probably succeeded.
+func (tab *Table) bondall(nodes []*Node) (result []*Node) {
+	rc := make(chan *Node, len(nodes))
+	for i := range nodes {
+		go func(n *Node) {
+			nn, _ := tab.bond(false, n.ID, n.addr(), uint16(n.TCPPort))
+			rc <- nn
+		}(nodes[i])
+	}
+	for _ = range nodes {
+		if n := <-rc; n != nil {
+			result = append(result, n)
 		}
 	}
-	return n
+	return result
 }

-func (tab *Table) pingReplace(n *Node, b *bucket) {
-	old := b.entries[bucketSize-1]
-	go func() {
-		if err := tab.net.ping(old); err == nil {
-			// it responded, we don't need to replace it.
-			return
+// bond ensures the local node has a bond with the given remote node.
+// It also attempts to insert the node into the table if bonding succeeds.
+// The caller must not hold tab.mutex.
+//
+// A bond must be established before sending findnode requests.
+// Both sides must have completed a ping/pong exchange for a bond to
+// exist. The total number of active bonding processes is limited in
+// order to restrain network use.
+//
+// bond is meant to operate idempotently in that bonding with a remote
+// node which still remembers a previously established bond will work.
+// The remote node will simply not send a ping back, causing waitping
+// to time out.
+//
+// If pinged is true, the remote node has just pinged us and one half
+// of the process can be skipped.
+func (tab *Table) bond(pinged bool, id NodeID, addr *net.UDPAddr, tcpPort uint16) (*Node, error) {
+	var n *Node
+	if n = tab.db.get(id); n == nil {
+		tab.bondmu.Lock()
+		w := tab.bonding[id]
+		if w != nil {
+			// Wait for an existing bonding process to complete.
+			tab.bondmu.Unlock()
+			<-w.done
+		} else {
+			// Register a new bonding process.
+			w = &bondproc{done: make(chan struct{})}
+			tab.bonding[id] = w
+			tab.bondmu.Unlock()
+			// Do the ping/pong. The result goes into w.
+			tab.pingpong(w, pinged, id, addr, tcpPort)
+			// Unregister the process after it's done.
+			tab.bondmu.Lock()
+			delete(tab.bonding, id)
+			tab.bondmu.Unlock()
 		}
-		// it didn't respond, replace the node if it is still the oldest node.
-		tab.mutex.Lock()
-		if len(b.entries) > 0 && b.entries[len(b.entries)-1] == old {
-			// slide down other entries and put the new one in front.
-			// TODO: insert in correct position to keep the order
-			copy(b.entries[1:], b.entries)
-			b.entries[0] = n
+		n = w.n
+		if w.err != nil {
+			return nil, w.err
 		}
-		tab.mutex.Unlock()
-	}()
+	}
+	tab.mutex.Lock()
+	defer tab.mutex.Unlock()
+	if b := tab.buckets[logdist(tab.self.ID, n.ID)]; !b.bump(n) {
+		tab.pingreplace(n, b)
+	}
+	return n, nil
+}
+
+func (tab *Table) pingpong(w *bondproc, pinged bool, id NodeID, addr *net.UDPAddr, tcpPort uint16) {
+	<-tab.bondslots
+	defer func() { tab.bondslots <- struct{}{} }()
+	if w.err = tab.net.ping(id, addr); w.err != nil {
+		close(w.done)
+		return
+	}
+	if !pinged {
+		// Give the remote node a chance to ping us before we start
+		// sending findnode requests. If they still remember us,
+		// waitping will simply time out.
+		tab.net.waitping(id)
+	}
+	w.n = tab.db.add(id, addr, tcpPort)
+	close(w.done)
 }

-// bump updates the activity timestamp for the given node.
-// The caller must hold tab.mutex.
-func (tab *Table) bump(node NodeID) {
-	tab.buckets[logdist(tab.self.ID, node)].bump(node)
+func (tab *Table) pingreplace(new *Node, b *bucket) {
+	if len(b.entries) == bucketSize {
+		oldest := b.entries[bucketSize-1]
+		if err := tab.net.ping(oldest.ID, oldest.addr()); err == nil {
+			// The node responded, we don't need to replace it.
+			return
+		}
+	} else {
+		// Add a slot at the end so the last entry doesn't
+		// fall off when adding the new node.
+		b.entries = append(b.entries, nil)
+	}
+	copy(b.entries[1:], b.entries)
+	b.entries[0] = new
 }

 // add puts the entries into the table if their corresponding
@@ -240,17 +323,17 @@ outer:
 	}
 }

-func (b *bucket) bump(id NodeID) *Node {
-	for i, n := range b.entries {
-		if n.ID == id {
-			n.active = time.Now()
+func (b *bucket) bump(n *Node) bool {
+	for i := range b.entries {
+		if b.entries[i].ID == n.ID {
+			n.bumpActive()
 			// move it to the front
-			copy(b.entries[1:], b.entries[:i+1])
+			copy(b.entries[1:], b.entries[:i])
 			b.entries[0] = n
-			return n
+			return true
 		}
 	}
-	return nil
+	return false
 }

 // nodesByDistance is a list of nodes, ordered by

--- a/p2p/discover/table_test.go
+++ b/p2p/discover/table_test.go
@@ -2,78 +2,109 @@ package discover

 import (
 	"crypto/ecdsa"
-	"errors"
 	"fmt"
 	"math/rand"
 	"net"
 	"reflect"
 	"testing"
 	"testing/quick"
-	"time"

 	"github.com/ethereum/go-ethereum/crypto"
 )

-func TestTable_bumpOrAddBucketAssign(t *testing.T) {
-	tab := newTable(nil, NodeID{}, &net.UDPAddr{})
-	for i := 1; i < len(tab.buckets); i++ {
-		tab.bumpOrAdd(randomID(tab.self.ID, i), &net.UDPAddr{})
-	}
-	for i, b := range tab.buckets {
-		if i > 0 && len(b.entries) != 1 {
-			t.Errorf("bucket %d has %d entries, want 1", i, len(b.entries))
+func TestTable_pingReplace(t *testing.T) {
+	doit := func(newNodeIsResponding, lastInBucketIsResponding bool) {
+		transport := newPingRecorder()
+		tab := newTable(transport, NodeID{}, &net.UDPAddr{})
+		last := fillBucket(tab, 200)
+		pingSender := randomID(tab.self.ID, 200)
+
+		// this bond call (acting on a received ping) should replace the
+		// last node only if it is not responding and the sender is.
+		transport.responding[last.ID] = lastInBucketIsResponding
+		transport.responding[pingSender] = newNodeIsResponding
+		tab.bond(true, pingSender, &net.UDPAddr{}, 0)
+
+		// first ping goes to sender (bonding pingback)
+		if !transport.pinged[pingSender] {
+			t.Error("table did not ping back sender")
+		}
+		if newNodeIsResponding {
+			// second ping goes to oldest node in bucket
+			// to see whether it is still alive.
+			if !transport.pinged[last.ID] {
+				t.Error("table did not ping last node in bucket")
+			}
 		}
-	}
-}
-
-func TestTable_bumpOrAddPingReplace(t *testing.T) {
-	pingC := make(pingC)
-	tab := newTable(pingC, NodeID{}, &net.UDPAddr{})
-	last := fillBucket(tab, 200)

-	// this bumpOrAdd should not replace the last node
-	// because the node replies to ping.
-	new := tab.bumpOrAdd(randomID(tab.self.ID, 200), &net.UDPAddr{})
+		tab.mutex.Lock()
+		defer tab.mutex.Unlock()
+		if l := len(tab.buckets[200].entries); l != bucketSize {
+			t.Errorf("wrong bucket size after gotPing: got %d, want %d", l, bucketSize)
+		}

-	pinged := <-pingC
-	if pinged != last.ID {
-		t.Fatalf("pinged wrong node: %v\nwant %v", pinged, last.ID)
+		if lastInBucketIsResponding || !newNodeIsResponding {
+			if !contains(tab.buckets[200].entries, last.ID) {
+				t.Error("last entry was removed")
+			}
+			if contains(tab.buckets[200].entries, pingSender) {
+				t.Error("new entry was added")
+			}
+		} else {
+			if contains(tab.buckets[200].entries, last.ID) {
+				t.Error("last entry was not removed")
+			}
+			if !contains(tab.buckets[200].entries, pingSender) {
+				t.Error("new entry was not added")
+			}
+		}
 	}

-	tab.mutex.Lock()
-	defer tab.mutex.Unlock()
-	if l := len(tab.buckets[200].entries); l != bucketSize {
-		t.Errorf("wrong bucket size after bumpOrAdd: got %d, want %d", bucketSize, l)
-	}
-	if !contains(tab.buckets[200].entries, last.ID) {
-		t.Error("last entry was removed")
-	}
-	if contains(tab.buckets[200].entries, new.ID) {
-		t.Error("new entry was added")
-	}
+	doit(true, true)
+	doit(false, true)
+	doit(true, false) // the only combination in which the last node gets replaced
+	doit(false, false)
 }

-func TestTable_bumpOrAddPingTimeout(t *testing.T) {
-	tab := newTable(pingC(nil), NodeID{}, &net.UDPAddr{})
-	last := fillBucket(tab, 200)
-
-	// this bumpOrAdd should replace the last node
-	// because the node does not reply to ping.
-	new := tab.bumpOrAdd(randomID(tab.self.ID, 200), &net.UDPAddr{})
-
-	// wait for async bucket update. damn. this needs to go away.
-	time.Sleep(2 * time.Millisecond)
-
-	tab.mutex.Lock()
-	defer tab.mutex.Unlock()
-	if l := len(tab.buckets[200].entries); l != bucketSize {
-		t.Errorf("wrong bucket size after bumpOrAdd: got %d, want %d", bucketSize, l)
+func TestBucket_bumpNoDuplicates(t *testing.T) {
+	t.Parallel()
+	cfg := &quick.Config{
+		MaxCount: 1000,
+		Rand:     quickrand,
+		Values: func(args []reflect.Value, rand *rand.Rand) {
+			// generate a random list of nodes. this will be the content of the bucket.
+			n := rand.Intn(bucketSize-1) + 1
+			nodes := make([]*Node, n)
+			for i := range nodes {
+				nodes[i] = &Node{ID: randomID(NodeID{}, 200)}
+			}
+			args[0] = reflect.ValueOf(nodes)
+			// generate random bump positions.
+			bumps := make([]int, rand.Intn(100))
+			for i := range bumps {
+				bumps[i] = rand.Intn(len(nodes))
+			}
+			args[1] = reflect.ValueOf(bumps)
+		},
 	}
-	if contains(tab.buckets[200].entries, last.ID) {
-		t.Error("last entry was not removed")
+
+	prop := func(nodes []*Node, bumps []int) (ok bool) {
+		b := &bucket{entries: make([]*Node, len(nodes))}
+		copy(b.entries, nodes)
+		for i, pos := range bumps {
+			b.bump(b.entries[pos])
+			if hasDuplicates(b.entries) {
+				t.Logf("bucket has duplicates after %d/%d bumps:", i+1, len(bumps))
+				for _, n := range b.entries {
+					t.Logf("  %p", n)
+				}
+				return false
+			}
+		}
+		return true
 	}
-	if !contains(tab.buckets[200].entries, new.ID) {
-		t.Error("new entry was not added")
+	if err := quick.Check(prop, cfg); err != nil {
+		t.Error(err)
 	}
 }

@@ -85,44 +116,27 @@ func fillBucket(tab *Table, ld int) (last *Node) {
 	return b.entries[bucketSize-1]
 }

-type pingC chan NodeID
+type pingRecorder struct{ responding, pinged map[NodeID]bool }

-func (t pingC) findnode(n *Node, target NodeID) ([]*Node, error) {
+func newPingRecorder() *pingRecorder {
+	return &pingRecorder{make(map[NodeID]bool), make(map[NodeID]bool)}
+}
+
+func (t *pingRecorder) findnode(toid NodeID, toaddr *net.UDPAddr, target NodeID) ([]*Node, error) {
 	panic("findnode called on pingRecorder")
 }
-func (t pingC) close() {
+func (t *pingRecorder) close() {
 	panic("close called on pingRecorder")
 }
-func (t pingC) ping(n *Node) error {
-	if t == nil {
-		return errTimeout
-	}
-	t <- n.ID
-	return nil
+func (t *pingRecorder) waitping(from NodeID) error {
+	return nil // remote always pings
 }
-
-func TestTable_bump(t *testing.T) {
-	tab := newTable(nil, NodeID{}, &net.UDPAddr{})
-
-	// add an old entry and two recent ones
-	oldactive := time.Now().Add(-2 * time.Minute)
-	old := &Node{ID: randomID(tab.self.ID, 200), active: oldactive}
-	others := []*Node{
-		&Node{ID: randomID(tab.self.ID, 200), active: time.Now()},
-		&Node{ID: randomID(tab.self.ID, 200), active: time.Now()},
-	}
-	tab.add(append(others, old))
-	if tab.buckets[200].entries[0] == old {
-		t.Fatal("old entry is at front of bucket")
-	}
-
-	// bumping the old entry should move it to the front
-	tab.bump(old.ID)
-	if old.active == oldactive {
-		t.Error("activity timestamp not updated")
-	}
-	if tab.buckets[200].entries[0] != old {
-		t.Errorf("bumped entry did not move to the front of bucket")
+func (t *pingRecorder) ping(toid NodeID, toaddr *net.UDPAddr) error {
+	t.pinged[toid] = true
+	if t.responding[toid] {
+		return nil
+	} else {
+		return errTimeout
 	}
 }

@@ -210,7 +224,7 @@ func TestTable_Lookup(t *testing.T) {
 		t.Fatalf("lookup on empty table returned %d results: %#v", len(results), results)
 	}
 	// seed table with initial node (otherwise lookup will terminate immediately)
-	tab.bumpOrAdd(randomID(target, 200), &net.UDPAddr{Port: 200})
+	tab.add([]*Node{newNode(randomID(target, 200), &net.UDPAddr{Port: 200})})

 	results := tab.Lookup(target)
 	t.Logf("results:")
@@ -238,16 +252,16 @@ type findnodeOracle struct {
 	target NodeID
 }

-func (t findnodeOracle) findnode(n *Node, target NodeID) ([]*Node, error) {
-	t.t.Logf("findnode query at dist %d", n.DiscPort)
+func (t findnodeOracle) findnode(toid NodeID, toaddr *net.UDPAddr, target NodeID) ([]*Node, error) {
+	t.t.Logf("findnode query at dist %d", toaddr.Port)
 	// current log distance is encoded in port number
 	var result []*Node
-	switch n.DiscPort {
+	switch toaddr.Port {
 	case 0:
 		panic("query to node at distance 0")
 	default:
 		// TODO: add more randomness to distances
-		next := n.DiscPort - 1
+		next := toaddr.Port - 1
 		for i := 0; i < bucketSize; i++ {
 			result = append(result, &Node{ID: randomID(t.target, next), DiscPort: next})
 		}
@@ -255,11 +269,9 @@ func (t findnodeOracle) findnode(n *Node, target NodeID) ([]*Node, error) {
 	return result, nil
 }

-func (t findnodeOracle) close() {}
-
-func (t findnodeOracle) ping(n *Node) error {
-	return errors.New("ping is not supported by this transport")
-}
+func (t findnodeOracle) close()                                      {}
+func (t findnodeOracle) waitping(from NodeID) error                  { return nil }
+func (t findnodeOracle) ping(toid NodeID, toaddr *net.UDPAddr) error { return nil }

 func hasDuplicates(slice []*Node) bool {
 	seen := make(map[NodeID]bool)

--- a/p2p/discover/udp.go
+++ b/p2p/discover/udp.go
--- a/p2p/discover/udp_test.go
+++ b/p2p/discover/udp_test.go