p2p/discover, p2p/enode: rework endpoint proof handling, packet logging (#18963)

This change resolves multiple issues around handling of endpoint proofs. The proof is now done separately for each IP and completing the proof requires a matching ping hash. Also remove waitping because it's equivalent to sleep. waitping was slightly more efficient, but that may cause issues with findnode if packets are reordered and the remote end sees findnode before pong. Logging of received packets was hitherto done after handling the packet, which meant that sent replies were logged before the packet that generated them. This change splits up packet handling into 'preverify' and 'handle'. The error from 'preverify' is logged, but 'handle' happens after the message is logged. This fixes the order. Packet logs now contain the node ID.

p2p/discover, p2p/enode: rework endpoint proof handling, packet logging (#18963)
This change resolves multiple issues around handling of endpoint proofs. The proof is now done separately for each IP and completing the proof requires a matching ping hash. Also remove waitping because it's equivalent to sleep. waitping was slightly more efficient, but that may cause issues with findnode if packets are reordered and the remote end sees findnode before pong. Logging of received packets was hitherto done after handling the packet, which meant that sent replies were logged before the packet that generated them. This change splits up packet handling into 'preverify' and 'handle'. The error from 'preverify' is logged, but 'handle' happens after the message is logged. This fixes the order. Packet logs now contain the node ID.
f0c6f921 · Felix Lange · GitHub · 74c38902 · f0c6f921 · f0c6f921
Unverified Commit f0c6f921 authored Jan 29, 2019 by Felix Lange Committed by GitHub Jan 29, 2019
8 changed files
--- a/p2p/discover/node.go
+++ b/p2p/discover/node.go
@@ -33,7 +33,8 @@ import (
 // The fields of Node may not be modified.
 type node struct {
 	enode.Node
-	addedAt time.Time // time when the node was added to the table
+	addedAt        time.Time // time when the node was added to the table
+	livenessChecks uint      // how often liveness was checked
 }
 type encPubkey [64]byte

--- a/p2p/discover/table.go
+++ b/p2p/discover/table.go
@@ -75,8 +75,10 @@ type Table struct {
 	net        transport
 	refreshReq chan chan struct{}
 	initDone   chan struct{}
-	closeReq   chan struct{}
-	closed     chan struct{}
+	closeOnce sync.Once
+	closeReq  chan struct{}
+	closed    chan struct{}
 	nodeAddedHook func(*node) // for testing
 }
@@ -180,16 +182,14 @@ func (tab *Table) ReadRandomNodes(buf []*enode.Node) (n int) {
 // Close terminates the network listener and flushes the node database.
 func (tab *Table) Close() {
-	if tab.net != nil {
+	tab.closeOnce.Do(func() {
-		tab.net.close()
+		if tab.net != nil {
-	}
+			tab.net.close()
+		}
-	select {
+		// Wait for loop to end.
-	case <-tab.closed:
+		close(tab.closeReq)
-		// already closed.
+		<-tab.closed
-	case tab.closeReq <- struct{}{}:
+	})
-		<-tab.closed // wait for refreshLoop to end.
-	}
 }
 // setFallbackNodes sets the initial points of contact. These nodes
@@ -290,12 +290,16 @@ func (tab *Table) lookup(targetKey encPubkey, refreshIfEmpty bool) []*node {
 			// we have asked all closest nodes, stop the search
 			break
 		}
-		// wait for the next reply
+		select {
-		for _, n := range <-reply {
+		case nodes := <-reply:
-			if n != nil && !seen[n.ID()] {
+			for _, n := range nodes {
-				seen[n.ID()] = true
+				if n != nil && !seen[n.ID()] {
-				result.push(n, bucketSize)
+					seen[n.ID()] = true
+					result.push(n, bucketSize)
+				}
 			}
+		case <-tab.closeReq:
+			return nil // shutdown, no need to continue.
 		}
 		pendingQueries--
 	}
@@ -303,18 +307,22 @@ func (tab *Table) lookup(targetKey encPubkey, refreshIfEmpty bool) []*node {
 }
 func (tab *Table) findnode(n *node, targetKey encPubkey, reply chan<- []*node) {
-	fails := tab.db.FindFails(n.ID())
+	fails := tab.db.FindFails(n.ID(), n.IP())
 	r, err := tab.net.findnode(n.ID(), n.addr(), targetKey)
-	if err != nil || len(r) == 0 {
+	if err == errClosed {
+		// Avoid recording failures on shutdown.
+		reply <- nil
+		return
+	} else if err != nil || len(r) == 0 {
 		fails++
-		tab.db.UpdateFindFails(n.ID(), fails)
+		tab.db.UpdateFindFails(n.ID(), n.IP(), fails)
 		log.Trace("Findnode failed", "id", n.ID(), "failcount", fails, "err", err)
 		if fails >= maxFindnodeFailures {
 			log.Trace("Too many findnode failures, dropping", "id", n.ID(), "failcount", fails)
 			tab.delete(n)
 		}
 	} else if fails > 0 {
-		tab.db.UpdateFindFails(n.ID(), fails-1)
+		tab.db.UpdateFindFails(n.ID(), n.IP(), fails-1)
 	}
 	// Grab as many nodes as possible. Some of them might not be alive anymore, but we'll
@@ -329,7 +337,7 @@ func (tab *Table) refresh() <-chan struct{} {
 	done := make(chan struct{})
 	select {
 	case tab.refreshReq <- done:
-	case <-tab.closed:
+	case <-tab.closeReq:
 		close(done)
 	}
 	return done
@@ -433,7 +441,7 @@ func (tab *Table) loadSeedNodes() {
 	seeds = append(seeds, tab.nursery...)
 	for i := range seeds {
 		seed := seeds[i]
-		age := log.Lazy{Fn: func() interface{} { return time.Since(tab.db.LastPongReceived(seed.ID())) }}
+		age := log.Lazy{Fn: func() interface{} { return time.Since(tab.db.LastPongReceived(seed.ID(), seed.IP())) }}
 		log.Trace("Found seed node in database", "id", seed.ID(), "addr", seed.addr(), "age", age)
 		tab.add(seed)
 	}
@@ -458,16 +466,17 @@ func (tab *Table) doRevalidate(done chan<- struct{}) {
 	b := tab.buckets[bi]
 	if err == nil {
 		// The node responded, move it to the front.
-		log.Debug("Revalidated node", "b", bi, "id", last.ID())
+		last.livenessChecks++
+		log.Debug("Revalidated node", "b", bi, "id", last.ID(), "checks", last.livenessChecks)
 		b.bump(last)
 		return
 	}
 	// No reply received, pick a replacement or delete the node if there aren't
 	// any replacements.
 	if r := tab.replace(b, last); r != nil {
-		log.Debug("Replaced dead node", "b", bi, "id", last.ID(), "ip", last.IP(), "r", r.ID(), "rip", r.IP())
+		log.Debug("Replaced dead node", "b", bi, "id", last.ID(), "ip", last.IP(), "checks", last.livenessChecks, "r", r.ID(), "rip", r.IP())
 	} else {
-		log.Debug("Removed dead node", "b", bi, "id", last.ID(), "ip", last.IP())
+		log.Debug("Removed dead node", "b", bi, "id", last.ID(), "ip", last.IP(), "checks", last.livenessChecks)
 	}
 }
@@ -502,7 +511,7 @@ func (tab *Table) copyLiveNodes() {
 	now := time.Now()
 	for _, b := range &tab.buckets {
 		for _, n := range b.entries {
-			if now.Sub(n.addedAt) >= seedMinTableTime {
+			if n.livenessChecks > 0 && now.Sub(n.addedAt) >= seedMinTableTime {
 				tab.db.UpdateNode(unwrapNode(n))
 			}
 		}
@@ -518,7 +527,9 @@ func (tab *Table) closest(target enode.ID, nresults int) *nodesByDistance {
 	close := &nodesByDistance{target: target}
 	for _, b := range &tab.buckets {
 		for _, n := range b.entries {
-			close.push(n, nresults)
+			if n.livenessChecks > 0 {
+				close.push(n, nresults)
+			}
 		}
 	}
 	return close
@@ -572,23 +583,6 @@ func (tab *Table) addThroughPing(n *node) {
 	tab.add(n)
 }
-// stuff adds nodes the table to the end of their corresponding bucket
-// if the bucket is not full. The caller must not hold tab.mutex.
-func (tab *Table) stuff(nodes []*node) {
-	tab.mutex.Lock()
-	defer tab.mutex.Unlock()
-	for _, n := range nodes {
-		if n.ID() == tab.self().ID() {
-			continue // don't add self
-		}
-		b := tab.bucket(n.ID())
-		if len(b.entries) < bucketSize {
-			tab.bumpOrAdd(b, n)
-		}
-	}
-}
 // delete removes an entry from the node table. It is used to evacuate dead nodes.
 func (tab *Table) delete(node *node) {
 	tab.mutex.Lock()

--- a/p2p/discover/table_test.go
+++ b/p2p/discover/table_test.go
@@ -50,8 +50,8 @@ func TestTable_pingReplace(t *testing.T) {
 func testPingReplace(t *testing.T, newNodeIsResponding, lastInBucketIsResponding bool) {
 	transport := newPingRecorder()
 	tab, db := newTestTable(transport)
-	defer tab.Close()
 	defer db.Close()
+	defer tab.Close()
 	<-tab.initDone
@@ -137,8 +137,8 @@ func TestBucket_bumpNoDuplicates(t *testing.T) {
 func TestTable_IPLimit(t *testing.T) {
 	transport := newPingRecorder()
 	tab, db := newTestTable(transport)
-	defer tab.Close()
 	defer db.Close()
+	defer tab.Close()
 	for i := 0; i < tableIPLimit+1; i++ {
 		n := nodeAtDistance(tab.self().ID(), i, net.IP{172, 0, 1, byte(i)})
@@ -153,8 +153,8 @@ func TestTable_IPLimit(t *testing.T) {
 func TestTable_BucketIPLimit(t *testing.T) {
 	transport := newPingRecorder()
 	tab, db := newTestTable(transport)
-	defer tab.Close()
 	defer db.Close()
+	defer tab.Close()
 	d := 3
 	for i := 0; i < bucketIPLimit+1; i++ {
@@ -173,9 +173,9 @@ func TestTable_closest(t *testing.T) {
 		// for any node table, Target and N
 		transport := newPingRecorder()
 		tab, db := newTestTable(transport)
-		defer tab.Close()
 		defer db.Close()
-		tab.stuff(test.All)
+		defer tab.Close()
+		fillTable(tab, test.All)
 		// check that closest(Target, N) returns nodes
 		result := tab.closest(test.Target, test.N).entries
@@ -234,13 +234,13 @@ func TestTable_ReadRandomNodesGetAll(t *testing.T) {
 	test := func(buf []*enode.Node) bool {
 		transport := newPingRecorder()
 		tab, db := newTestTable(transport)
-		defer tab.Close()
 		defer db.Close()
+		defer tab.Close()
 		<-tab.initDone
 		for i := 0; i < len(buf); i++ {
 			ld := cfg.Rand.Intn(len(tab.buckets))
-			tab.stuff([]*node{nodeAtDistance(tab.self().ID(), ld, intIP(ld))})
+			fillTable(tab, []*node{nodeAtDistance(tab.self().ID(), ld, intIP(ld))})
 		}
 		gotN := tab.ReadRandomNodes(buf)
 		if gotN != tab.len() {
@@ -272,16 +272,19 @@ func (*closeTest) Generate(rand *rand.Rand, size int) reflect.Value {
 		N:      rand.Intn(bucketSize),
 	}
 	for _, id := range gen([]enode.ID{}, rand).([]enode.ID) {
-		n := enode.SignNull(new(enr.Record), id)
+		r := new(enr.Record)
-		t.All = append(t.All, wrapNode(n))
+		r.Set(enr.IP(genIP(rand)))
+		n := wrapNode(enode.SignNull(r, id))
+		n.livenessChecks = 1
+		t.All = append(t.All, n)
 	}
 	return reflect.ValueOf(t)
 }
 func TestTable_Lookup(t *testing.T) {
 	tab, db := newTestTable(lookupTestnet)
-	defer tab.Close()
 	defer db.Close()
+	defer tab.Close()
 	// lookup on empty table returns no nodes
 	if results := tab.lookup(lookupTestnet.target, false); len(results) > 0 {
@@ -289,8 +292,9 @@ func TestTable_Lookup(t *testing.T) {
 	}
 	// seed table with initial node (otherwise lookup will terminate immediately)
 	seedKey, _ := decodePubkey(lookupTestnet.dists[256][0])
-	seed := wrapNode(enode.NewV4(seedKey, net.IP{}, 0, 256))
+	seed := wrapNode(enode.NewV4(seedKey, net.IP{127, 0, 0, 1}, 0, 256))
-	tab.stuff([]*node{seed})
+	seed.livenessChecks = 1
+	fillTable(tab, []*node{seed})
 	results := tab.lookup(lookupTestnet.target, true)
 	t.Logf("results:")
@@ -578,6 +582,12 @@ func gen(typ interface{}, rand *rand.Rand) interface{} {
 	return v.Interface()
 }
+func genIP(rand *rand.Rand) net.IP {
+	ip := make(net.IP, 4)
+	rand.Read(ip)
+	return ip
+}
 func quickcfg() *quick.Config {
 	return &quick.Config{
 		MaxCount: 5000,

--- a/p2p/discover/table_util_test.go
+++ b/p2p/discover/table_util_test.go
@@ -83,6 +83,23 @@ func fillBucket(tab *Table, n *node) (last *node) {
 	return b.entries[bucketSize-1]
 }
+// fillTable adds nodes to the table, at the end of their corresponding bucket,
+// if the bucket is not full. The caller must not hold tab.mutex.
+func fillTable(tab *Table, nodes []*node) {
+	tab.mutex.Lock()
+	defer tab.mutex.Unlock()
+	for _, n := range nodes {
+		if n.ID() == tab.self().ID() {
+			continue // don't add self
+		}
+		b := tab.bucket(n.ID())
+		if len(b.entries) < bucketSize {
+			tab.bumpOrAdd(b, n)
+		}
+	}
+}
 type pingRecorder struct {
 	mu           sync.Mutex
 	dead, pinged map[enode.ID]bool
@@ -109,10 +126,6 @@ func (t *pingRecorder) findnode(toid enode.ID, toaddr *net.UDPAddr, target encPu
 	return nil, nil
 }
-func (t *pingRecorder) waitping(from enode.ID) error {
-	return nil // remote always pings
-}
 func (t *pingRecorder) ping(toid enode.ID, toaddr *net.UDPAddr) error {
 	t.mu.Lock()
 	defer t.mu.Unlock()

--- a/p2p/discover/udp.go
+++ b/p2p/discover/udp.go
--- a/p2p/discover/udp_test.go
+++ b/p2p/discover/udp_test.go
--- a/p2p/enode/nodedb.go
+++ b/p2p/enode/nodedb.go
--- a/p2p/enode/nodedb_test.go
+++ b/p2p/enode/nodedb_test.go