commit 43a3ba2a00
Frans Kaashoek, 2025-01-31 12:47:59 -05:00
77 changed files with 75991 additions and 0 deletions

.check-build (Executable file, 137 lines)
@@ -0,0 +1,137 @@
#!/usr/bin/env bash
set -eu
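# Example invocation (this is what the Makefile's check-% rule runs):
#   ./.check-build git://g.csail.mit.edu/6.5840-golabs-2024 lab3a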
REFERENCE_FILES=(
# lab 1
src/mrapps/crash.go
src/mrapps/indexer.go
src/mrapps/mtiming.go
src/mrapps/nocrash.go
src/mrapps/rtiming.go
src/mrapps/wc.go
src/main/mrsequential.go
src/main/mrcoordinator.go
src/main/mrworker.go
# lab 2
src/kvsrv/test_test.go
src/kvsrv/config.go
# lab 3
src/raft/persister.go
src/raft/test_test.go
src/raft/config.go
src/labrpc/labrpc.go
# lab 4
src/kvraft/test_test.go
src/kvraft/config.go
# lab 5a
src/shardctrler/test_test.go
src/shardctrler/config.go
# lab 5b
src/shardkv/test_test.go
src/shardkv/config.go
)
main() {
upstream="$1"
labnum="$2"
# make sure we have a reference copy of the lab, in FETCH_HEAD
git fetch "$upstream" 2>/dev/null || die "unable to git fetch $upstream"
# copy existing directory
tmpdir="$(mktemp -d)"
find src -type s -delete # cp can't copy sockets
cp -r src "$tmpdir"
orig="$PWD"
cd "$tmpdir"
# check out reference files
for f in "${REFERENCE_FILES[@]}"; do
mkdir -p "$(dirname "$f")"
git --git-dir="$orig/.git" show "FETCH_HEAD:$f" > "$f"
done
case $labnum in
"lab1") check_lab1;;
"lab2") check_lab2;;
"lab3a"|"lab3b"|"lab3c"|"lab3d") check_lab3;;
"lab4a"|"lab4b") check_lab4;;
"lab5a") check_lab5a;;
"lab5b") check_lab5b;;
*) die "unknown lab: $labnum";;
esac
cd
rm -rf "$tmpdir"
}
check_lab1() {
check_cmd cd src/mrapps
check_cmd go build -buildmode=plugin wc.go
check_cmd go build -buildmode=plugin indexer.go
check_cmd go build -buildmode=plugin mtiming.go
check_cmd go build -buildmode=plugin rtiming.go
check_cmd go build -buildmode=plugin crash.go
check_cmd go build -buildmode=plugin nocrash.go
check_cmd cd ../main
check_cmd go build mrcoordinator.go
check_cmd go build mrworker.go
check_cmd go build mrsequential.go
}
check_lab2() {
check_cmd cd src/kvsrv
check_cmd go test -c
}
check_lab3() {
check_cmd cd src/raft
check_cmd go test -c
}
check_lab4() {
check_cmd cd src/kvraft
check_cmd go test -c
}
check_lab5a() {
check_cmd cd src/shardctrler
check_cmd go test -c
}
check_lab5b() {
check_cmd cd src/shardkv
check_cmd go test -c
# also check other labs/parts
cd "$tmpdir"
check_lab5a
cd "$tmpdir"
check_lab4
cd "$tmpdir"
check_lab3
}
check_cmd() {
if ! "$@" >/dev/null 2>&1; then
echo "We tried building your source code with testing-related files reverted to original versions, and the build failed. This copy of your code is preserved in $tmpdir for debugging purposes. Please make sure the code you are trying to hand in does not make changes to test code." >&2
echo >&2
echo "The build failed while trying to run the following command:" >&2
echo >&2
echo "$ $@" >&2
echo " (cwd: ${PWD#$tmpdir/})" >&2
exit 1
fi
}
die() {
echo "$1" >&2
exit 1
}
main "$@"

.gitignore (vendored, Normal file, 4 lines)
@@ -0,0 +1,4 @@
pkg/
api.key
.api.key.trimmed
*-handin.tar.gz

Makefile (Normal file, 34 lines)
@@ -0,0 +1,34 @@
# This is the Makefile helping you submit the labs.
# Just create 6.5840/api.key with your API key in it,
# and submit your lab with the following command:
# $ make [lab1|lab2|lab3a|lab3b|lab3c|lab3d|lab4a|lab4b|lab5a|lab5b]
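# For example, `make lab3a` runs the build check below and then produces
# lab3a-handin.tar.gz for manual upload to Gradescope.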
LABS=" lab1 lab2 lab3a lab3b lab3c lab3d lab4a lab4b lab5a lab5b "
%: check-%
@echo "Preparing $@-handin.tar.gz"
@if echo $(LABS) | grep -q " $@ " ; then \
echo "Tarring up your submission..." ; \
COPYFILE_DISABLE=1 tar cvzf $@-handin.tar.gz \
"--exclude=src/main/pg-*.txt" \
"--exclude=src/main/diskvd" \
"--exclude=src/mapreduce/824-mrinput-*.txt" \
"--exclude=src/mapreduce/5840-mrinput-*.txt" \
"--exclude=src/main/mr-*" \
"--exclude=mrtmp.*" \
"--exclude=src/main/diff.out" \
"--exclude=src/main/mrcoordinator" \
"--exclude=src/main/mrsequential" \
"--exclude=src/main/mrworker" \
"--exclude=*.so" \
Makefile src; \
if test `stat -c "%s" "$@-handin.tar.gz" 2>/dev/null || stat -f "%z" "$@-handin.tar.gz"` -ge 20971520 ; then echo "File exceeds 20MB."; rm $@-handin.tar.gz; exit; fi; \
echo "$@-handin.tar.gz successfully created. Please upload the tarball manually on Gradescope."; \
else \
echo "Bad target $@. Usage: make [$(LABS)]"; \
fi
.PHONY: check-%
check-%:
@echo "Checking that your submission builds correctly..."
@./.check-build git://g.csail.mit.edu/6.5840-golabs-2024 $(patsubst check-%,%,$@)

src/.gitignore (vendored, Normal file, 12 lines)
@@ -0,0 +1,12 @@
*.*/
main/mr-tmp/
mrtmp.*
824-mrinput-*.txt
/main/diff.out
/mapreduce/x.txt
/pbservice/x.txt
/kvpaxos/x.txt
*.so
/main/mrcoordinator
/main/mrsequential
/main/mrworker

src/go.mod (Normal file, 5 lines)
@@ -0,0 +1,5 @@
module 6.5840
go 1.21
require github.com/anishathalye/porcupine v1.0.0

src/go.sum (Normal file, 2 lines)
@@ -0,0 +1,2 @@
github.com/anishathalye/porcupine v1.0.0 h1:93eF6d26IMDky+G4h8FcLuYp1oO+no8a//I7asq/oKI=
github.com/anishathalye/porcupine v1.0.0/go.mod h1:WM0SsFjWNl2Y4BqHr/E/ll2yY1GY1jqn+W7Z/84Zoog=

src/kvraft1/client.go (Normal file, 31 lines)
@@ -0,0 +1,31 @@
package kvraft
import (
"6.5840/kvsrv1/rpc"
"6.5840/kvtest1"
"6.5840/tester1"
)
type Clerk struct {
clnt *tester.Clnt
servers []string
// You will have to modify this struct.
}
func MakeClerk(clnt *tester.Clnt, servers []string) kvtest.IKVClerk {
ck := &Clerk{clnt: clnt, servers: servers}
// You'll have to add code here.
return ck
}
func (ck *Clerk) Get(key string) (string, rpc.Tversion, rpc.Err) {
// You will have to modify this function.
return "", 0, ""
}
func (ck *Clerk) Put(key string, value string, version rpc.Tversion) rpc.Err {
// You will have to modify this function.
return ""
}
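// Note: unlike the single-server clerk in kvsrv1, this clerk talks to a set
// of replicas. A common pattern (an assumption here, not a requirement) is to
// remember which entry of ck.servers last replied successfully and, on
// rpc.ErrWrongLeader or a lost reply, to retry the same request at the next
// server until one accepts it.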

src/kvraft1/kvraft_test.go (Normal file, 405 lines)
@@ -0,0 +1,405 @@
package kvraft
import (
// "log"
"strconv"
"testing"
"time"
"6.5840/kvsrv1/rpc"
"6.5840/kvtest1"
)
const (
NSEC = 1
NCLNT = 10
)
// The basic test is as follows: one or more clients submit Put/Get
// operations to a set of servers for some period of time using
// kvtest.OneClientPut. After the period is over, the test checks that the
// values put and got form a linearizable history. If unreliable is set,
// RPCs may fail. If crash is set, the servers crash after the period
// is over and restart. If partitions is set, the test repartitions
// the network concurrently with the clients and servers. If
// maxraftstate is a positive number, the size of the state for Raft
// (i.e., log size) shouldn't exceed 8*maxraftstate. If maxraftstate
// is negative, snapshots shouldn't be used.
func (ts *Test) GenericTest() {
const (
NITER = 3
T = NSEC * time.Second
NKEYS = 100
)
// const T = 1 * time.Millisecond
defer ts.Cleanup()
ch_partitioner := make(chan bool)
ch_spawn := make(chan struct{})
ck := ts.MakeClerk()
res := kvtest.ClntRes{}
default_key := []string{"k"} // if not running with randomkeys
if ts.randomkeys {
default_key = kvtest.MakeKeys(NKEYS)
}
for i := 0; i < NITER; i++ {
// log.Printf("Iteration %v\n", i)
go func() {
rs := ts.SpawnClientsAndWait(ts.nclients, T, func(cli int, ck kvtest.IKVClerk, done chan struct{}) kvtest.ClntRes {
return ts.OneClientPut(cli, ck, default_key, done)
})
if !ts.randomkeys {
ts.CheckPutConcurrent(ck, default_key[0], rs, &res)
}
ch_spawn <- struct{}{}
}()
if ts.partitions {
// Allow the clients to perform some operations without interruption
time.Sleep(1 * time.Second)
go ts.Partitioner(Gid, ch_partitioner)
}
<-ch_spawn // wait for clients to be done
ts.CheckPorcupine()
if ts.partitions {
ch_partitioner <- true
// log.Printf("wait for partitioner\n")
<-ch_partitioner
// reconnect network and submit a request. A client may
// have submitted a request in a minority. That request
// won't return until that server discovers a new term
// has started.
ts.Group(Gid).ConnectAll()
// wait for a while so that we have a new term
time.Sleep(kvtest.ElectionTimeout)
}
if ts.crash {
// log.Printf("shutdown servers\n")
for i := 0; i < ts.nservers; i++ {
ts.Group(Gid).ShutdownServer(i)
}
// Wait for a while for servers to shutdown, since
// shutdown isn't a real crash and isn't instantaneous
time.Sleep(kvtest.ElectionTimeout)
// log.Printf("restart servers\n")
// crash and re-start all
for i := 0; i < ts.nservers; i++ {
ts.Group(Gid).StartServer(i)
}
ts.Group(Gid).ConnectAll()
}
if ts.maxraftstate > 0 {
// Check maximum after the servers have processed all client
// requests and had time to checkpoint.
sz := ts.Config.Group(Gid).LogSize()
if sz > 8*ts.maxraftstate {
ts.Fatalf("logs were not trimmed (%v > 8*%v)", sz, ts.maxraftstate)
}
}
if ts.maxraftstate < 0 {
// Check that snapshots are not used
ssz := ts.Group(Gid).SnapshotSize()
if ssz > 0 {
ts.t.Fatalf("snapshot too large (%v), should not be used when maxraftstate = %d", ssz, ts.maxraftstate)
}
}
}
}
// check that ops are committed fast enough, better than 1 per heartbeat interval
func (ts *Test) GenericTestSpeed() {
const numOps = 1000
defer ts.Cleanup()
ck := ts.MakeClerk()
// wait until first op completes, so we know a leader is elected
// and KV servers are ready to process client requests
ck.Get("x")
start := time.Now()
for i := 0; i < numOps; i++ {
if err := ck.Put("k", strconv.Itoa(i), rpc.Tversion(i)); err != rpc.OK {
ts.t.Fatalf("Put err %v", err)
}
}
dur := time.Since(start)
if _, ver, err := ck.Get("k"); err != rpc.OK {
ts.t.Fatalf("Get err %v", err)
} else if ver != numOps {
ts.t.Fatalf("Get too few ops %v", ver)
}
// heartbeat interval should be ~100 ms; require at least 3 ops per heartbeat interval
const heartbeatInterval = 100 * time.Millisecond
const opsPerInterval = 3
const timePerOp = heartbeatInterval / opsPerInterval
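// e.g., with numOps=1000 and timePerOp=100ms/3, the loop must finish in
// under ~33 seconds overall, i.e. about 33ms per Put on average.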
if dur > numOps*timePerOp {
ts.t.Fatalf("Operations completed too slowly %v/op > %v/op\n", dur/numOps, timePerOp)
}
}
func TestBasic4A(t *testing.T) {
ts := MakeTest(t, "4A basic", 1, 5, true, false, false, -1, false)
ts.GenericTest()
}
func TestSpeed4A(t *testing.T) {
ts := MakeTest(t, "4A speed", 1, 3, true, false, false, -1, false)
ts.GenericTestSpeed()
}
func TestConcurrent4A(t *testing.T) {
ts := MakeTest(t, "4A many clients", 5, 5, true, false, false, -1, false)
ts.GenericTest()
}
func TestUnreliable4A(t *testing.T) {
ts := MakeTest(t, "4A unreliable net, many clients", 5, 5, false, false, false, -1, false)
ts.GenericTest()
}
// Submit a request in the minority partition and check that the request
// doesn't go through until the partition heals. The leader in the original
// network ends up in the minority partition.
func TestOnePartition4A(t *testing.T) {
ts := MakeTest(t, "4A progress in majority", 0, 5, false, false, false, -1, false)
defer ts.Cleanup()
ck := ts.MakeClerk()
ver0 := ts.PutAtLeastOnce(ck, "1", "13", rpc.Tversion(0), -1)
p1, p2 := ts.Group(Gid).MakePartition()
ts.Group(Gid).Partition(p1, p2)
ckp1 := ts.MakeClerkTo(p1) // connect ckp1 to p1
ckp2a := ts.MakeClerkTo(p2) // connect ckp2a to p2
ckp2b := ts.MakeClerkTo(p2) // connect ckp2b to p2
ver1 := ts.PutAtLeastOnce(ckp1, "1", "14", ver0+1, -1)
ts.CheckGet(ckp1, "1", "14", ver1)
ts.End()
done0 := make(chan rpc.Tversion)
done1 := make(chan rpc.Tversion)
ts.Begin("Test: no progress in minority (4A)")
go func() {
ver := ts.PutAtLeastOnce(ckp2a, "1", "15", ver1+1, -1)
done0 <- ver
}()
go func() {
_, ver, _ := ts.Get(ckp2b, "1", -1) // different clerk in p2
done1 <- ver
}()
select {
case ver := <-done0:
t.Fatalf("Put in minority completed %v", ver)
case ver := <-done1:
t.Fatalf("Get in minority completed %v", ver)
case <-time.After(time.Second):
}
ts.CheckGet(ckp1, "1", "14", ver1)
ver2 := ts.PutAtLeastOnce(ckp1, "1", "16", ver1+1, -1)
ts.CheckGet(ckp1, "1", "16", ver2)
ts.End()
ts.Begin("Test: completion after heal (4A)")
ts.Group(Gid).ConnectAll()
ckp2a.(*kvtest.TestClerk).Clnt.ConnectAll()
ckp2b.(*kvtest.TestClerk).Clnt.ConnectAll()
time.Sleep(kvtest.ElectionTimeout)
select {
case <-done0:
case <-time.After(30 * 100 * time.Millisecond):
t.Fatalf("Put did not complete")
}
select {
case <-done1:
case <-time.After(30 * 100 * time.Millisecond):
t.Fatalf("Get did not complete")
default:
}
ts.CheckGet(ck, "1", "15", ver2+1)
}
func TestManyPartitionsOneClient4A(t *testing.T) {
ts := MakeTest(t, "4A partitions, one client", 1, 5, false, false, true, -1, false)
ts.GenericTest()
}
func TestManyPartitionsManyClients4A(t *testing.T) {
ts := MakeTest(t, "4A partitions, many clients (4A)", 5, 5, false, false, true, -1, false)
ts.GenericTest()
}
func TestPersistOneClient4A(t *testing.T) {
ts := MakeTest(t, "4A restarts, one client 4A ", 1, 5, false, true, false, -1, false)
ts.GenericTest()
}
func TestPersistConcurrent4A(t *testing.T) {
ts := MakeTest(t, "4A restarts, many clients", 5, 5, false, true, false, -1, false)
ts.GenericTest()
}
func TestPersistConcurrentUnreliable4A(t *testing.T) {
ts := MakeTest(t, "4A unreliable net, restarts, many clients ", 5, 5, true, true, false, -1, false)
ts.GenericTest()
}
func TestPersistPartition4A(t *testing.T) {
ts := MakeTest(t, "4A restarts, partitions, many clients", 5, 5, false, true, true, -1, false)
ts.GenericTest()
}
func TestPersistPartitionUnreliable4A(t *testing.T) {
ts := MakeTest(t, "4A unreliable net, restarts, partitions, many clients", 5, 5, true, true, true, -1, false)
ts.GenericTest()
}
func TestPersistPartitionUnreliableLinearizable4A(t *testing.T) {
ts := MakeTest(t, "4A unreliable net, restarts, partitions, random keys, many clients", 15, 7, true, true, true, -1, true)
ts.GenericTest()
}
// if one server falls behind and then rejoins, does it
// recover by using the InstallSnapshot RPC?
// also checks that the majority discards committed log entries
// even if the minority doesn't respond.
func TestSnapshotRPC4B(t *testing.T) {
ts := MakeTest(t, "4B SnapshotsRPC", 0, 3, false, false, false, 1000, false)
defer ts.Cleanup()
ck := ts.MakeClerk()
ts.Begin("Test: InstallSnapshot RPC (4B)")
vera := ts.PutAtLeastOnce(ck, "a", "A", rpc.Tversion(0), -1)
ts.CheckGet(ck, "a", "A", vera)
verb := rpc.Tversion(0)
// a bunch of puts into the majority partition.
ts.Group(Gid).Partition([]int{0, 1}, []int{2})
{
ck1 := ts.MakeClerkTo([]int{0, 1})
for i := 0; i < 50; i++ {
verb = ts.PutAtLeastOnce(ck1, strconv.Itoa(i), strconv.Itoa(i), rpc.Tversion(0), -1)
}
time.Sleep(kvtest.ElectionTimeout)
verb = ts.PutAtLeastOnce(ck1, "b", "B", verb, -1)
}
// check that the majority partition has thrown away
// most of its log entries.
sz := ts.Group(Gid).LogSize()
if sz > 8*ts.maxraftstate {
t.Fatalf("logs were not trimmed (%v > 8*%v)", sz, ts.maxraftstate)
}
// now make a group that requires the participation of the
// lagging server, so that it has to catch up.
ts.Group(Gid).Partition([]int{0, 2}, []int{1})
{
ck1 := ts.MakeClerkTo([]int{0, 2})
ts.PutAtLeastOnce(ck1, "c", "C", rpc.Tversion(0), -1)
ts.PutAtLeastOnce(ck1, "d", "D", rpc.Tversion(0), -1)
ts.CheckGet(ck1, "a", "A", vera)
ts.CheckGet(ck1, "b", "B", verb)
ts.CheckGet(ck1, "1", "1", rpc.Tversion(1))
ts.CheckGet(ck1, "49", "49", rpc.Tversion(1))
}
// now everybody
ts.Group(Gid).Partition([]int{0, 1, 2}, []int{})
vere := ts.PutAtLeastOnce(ck, "e", "E", rpc.Tversion(0), -1)
ts.CheckGet(ck, "c", "C", 1)
ts.CheckGet(ck, "e", "E", vere)
ts.CheckGet(ck, "1", "1", rpc.Tversion(1))
}
// are the snapshots not too huge? 500 bytes is a generous bound for the
// operations we're doing here.
func TestSnapshotSize4B(t *testing.T) {
ts := MakeTest(t, "4B snapshot size is reasonable", 0, 3, false, false, false, 1000, false)
defer ts.Cleanup()
maxsnapshotstate := 500
ck := ts.MakeClerk()
ver := rpc.Tversion(0)
for i := 0; i < 200; i++ {
ver = ts.PutAtLeastOnce(ck, "x", "0", ver, -1)
ts.CheckGet(ck, "x", "0", ver)
ver = ts.PutAtLeastOnce(ck, "x", "1", ver+1, -1)
ts.CheckGet(ck, "x", "1", ver)
ver += 1
}
// check that servers have thrown away most of their log entries
sz := ts.Group(Gid).LogSize()
if sz > 8*ts.maxraftstate {
t.Fatalf("logs were not trimmed (%v > 8*%v)", sz, ts.maxraftstate)
}
// check that the snapshots are not unreasonably large
ssz := ts.Group(Gid).SnapshotSize()
if ssz > maxsnapshotstate {
t.Fatalf("snapshot too large (%v > %v)", ssz, maxsnapshotstate)
}
}
func TestSpeed4B(t *testing.T) {
ts := MakeTest(t, "4B speed", 1, 3, true, false, false, 1000, false)
ts.GenericTestSpeed()
}
func TestSnapshotRecover4B(t *testing.T) {
ts := MakeTest(t, "4B restarts, snapshots, one client", 1, 5, true, true, false, 1000, false)
ts.GenericTest()
}
func TestSnapshotRecoverManyClients4B(t *testing.T) {
ts := MakeTest(t, "4B restarts, snapshots, many clients ", 20, 5, true, true, false, 1000, false)
ts.GenericTest()
}
func TestSnapshotUnreliable4B(t *testing.T) {
ts := MakeTest(t, "4B unreliable net, snapshots, many clients", 5, 5, false, false, false, 1000, false)
ts.GenericTest()
}
func TestSnapshotUnreliableRecover4B(t *testing.T) {
ts := MakeTest(t, "4B unreliable net, restarts, snapshots, many clients", 5, 5, false, true, false, 1000, false)
ts.GenericTest()
}
func TestSnapshotUnreliableRecoverConcurrentPartition4B(t *testing.T) {
ts := MakeTest(t, "4B unreliable net, restarts, partitions, snapshots, many clients", 5, 5, false, true, true, 1000, false)
ts.GenericTest()
}
func TestSnapshotUnreliableRecoverConcurrentPartitionLinearizable4B(t *testing.T) {
ts := MakeTest(t, "4B unreliable net, restarts, partitions, snapshots, random keys, many clients", 15, 7, false, true, true, 1000, true)
ts.GenericTest()
}

src/kvraft1/rsm/rsm.go (Normal file, 88 lines)
@@ -0,0 +1,88 @@
package rsm
import (
"sync"
"6.5840/kvsrv1/rpc"
"6.5840/labrpc"
"6.5840/raft"
)
type Op struct {
// Your definitions here.
// Field names must start with capital letters,
// otherwise RPC will break.
}
// A server (i.e., ../server.go) that wants to replicate itself calls
// MakeRSM and must implement the StateMachine interface. This
// interface allows the rsm package to interact with the server for
// server-specific operations: the server must implement DoOp to
// execute an operation (e.g., a Get or Put request), and
// Snapshot/Restore to snapshot and restore the server's state.
type StateMachine interface {
DoOp(any) any
Snapshot() []byte
Restore([]byte)
}
type RSM struct {
mu sync.Mutex
me int
rf *raft.Raft
applyCh chan raft.ApplyMsg
maxraftstate int // snapshot if log grows this big
sm StateMachine
// Your definitions here.
}
// servers[] contains the ports of the set of
// servers that will cooperate via Raft to
// form the fault-tolerant key/value service.
// me is the index of the current server in servers[].
// the k/v server should store snapshots through the underlying Raft
// implementation, which should call persister.SaveStateAndSnapshot() to
// atomically save the Raft state along with the snapshot.
// The RSM should snapshot when Raft's saved state exceeds maxraftstate bytes,
// in order to allow Raft to garbage-collect its log. if maxraftstate is -1,
// you don't need to snapshot.
//
// MakeRSM() must return quickly, so it should start goroutines for
// any long-running work.
func MakeRSM(servers []*labrpc.ClientEnd, me int, persister *raft.Persister, maxraftstate int, sm StateMachine) *RSM {
rsm := &RSM{
me: me,
maxraftstate: maxraftstate,
applyCh: make(chan raft.ApplyMsg),
sm: sm,
}
rsm.rf = raft.Make(servers, me, persister, rsm.applyCh)
return rsm
}
func (rsm *RSM) Raft() *raft.Raft {
return rsm.rf
}
// submit a command to Raft, and wait for it to be committed.
// perform() will tell us via ClientStatus and lastApplied
// whether our command was executed or not.
//
// returns (executeError, executeResult)
// if executeError==ErrWrongLeader, the client should find the new leader
// and try again.
func (rsm *RSM) Submit(req any) (rpc.Err, any) {
rsm.mu.Lock()
defer rsm.mu.Unlock()
// Submit creates an Op structure to run a command through Raft;
// for example: op := Op{Id: rsm.nextId, Req: req}, where req is
// the argument to Submit and rsm.nextId a unique id for the op.
// your code here
return rpc.ErrWrongLeader, nil // i'm dead, try another server.
}
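// A common shape for the Submit flow (an illustrative sketch, not part of the
// handout: nextId, the per-index wait, and the apply-reading goroutine are all
// assumptions, and it presumes the usual Raft.Start(cmd) (index, term,
// isLeader) API from the Raft lab):
//
//	op := Op{Id: rsm.nextId, Req: req}
//	index, _, isLeader := rsm.rf.Start(op)
//	if !isLeader {
//		return rpc.ErrWrongLeader, nil
//	}
//	// release rsm.mu and wait for the goroutine that reads applyCh to
//	// report what was committed at `index`; if it is our op, return
//	// (rpc.OK, result), otherwise a leader change overwrote the slot and
//	// we return rpc.ErrWrongLeader so the client retries elsewhere.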

@@ -0,0 +1,63 @@
package rsm
import (
//"log"
"testing"
)
// test that each server executes increments and updates its counter.
func TestBasic(t *testing.T) {
ts := makeTest(t, -1)
defer ts.cleanup()
ts.Begin("Test RSM basic")
for i := 0; i < 10; i++ {
r := ts.one()
if r.N != i+1 {
ts.Fatalf("expected %d instead of %d", i, r.N)
}
ts.checkCounter(r.N, NSRV)
}
}
// test that each server executes increments after disconnecting and
// reconnecting leader
func TestLeaderFailure(t *testing.T) {
ts := makeTest(t, -1)
defer ts.cleanup()
r := ts.one()
ts.checkCounter(r.N, NSRV)
l := ts.disconnectLeader()
r = ts.one()
ts.checkCounter(r.N, NSRV-1)
ts.connect(l)
ts.checkCounter(r.N, NSRV)
}
// test snapshot and restore
func TestSnapshot(t *testing.T) {
const N = 100
ts := makeTest(t, 1000)
defer ts.cleanup()
for i := 0; i < N; i++ {
ts.one()
}
ts.checkCounter(N, NSRV)
// rsm must have made snapshots by now; shut down all servers and
// restart them from a snapshot
ts.g.Shutdown()
ts.g.StartServers()
// make restarted servers do one increment
ts.one()
ts.checkCounter(N+1, NSRV)
}

src/kvraft1/rsm/server.go (Normal file, 77 lines)
@@ -0,0 +1,77 @@
package rsm
import (
"bytes"
"log"
"sync"
"6.5840/labgob"
// "6.5840/kvtest1"
"6.5840/labrpc"
"6.5840/raft"
)
type Inc struct {
}
type Rep struct {
N int
}
type rsmSrv struct {
ts *Test
me int
rsm *RSM
mu sync.Mutex
counter int
}
func makeRsmSrv(ts *Test, srv int, ends []*labrpc.ClientEnd, persister *raft.Persister, snapshot bool) *rsmSrv {
//log.Printf("mksrv %d", srv)
labgob.Register(Op{})
labgob.Register(Inc{})
labgob.Register(Rep{})
s := &rsmSrv{
ts: ts,
me: srv,
}
s.rsm = MakeRSM(ends, srv, persister, ts.maxraftstate, s)
return s
}
func (rs *rsmSrv) DoOp(req any) any {
//log.Printf("%d: DoOp: %v", rs.me, req)
rs.counter += 1
return &Rep{rs.counter}
}
func (rs *rsmSrv) Snapshot() []byte {
//log.Printf("%d: snapshot", rs.me)
w := new(bytes.Buffer)
e := labgob.NewEncoder(w)
e.Encode(rs.counter)
return w.Bytes()
}
func (rs *rsmSrv) Restore(data []byte) {
r := bytes.NewBuffer(data)
d := labgob.NewDecoder(r)
if d.Decode(&rs.counter) != nil {
log.Fatalf("%v couldn't decode counter", rs.me)
}
//log.Printf("%d: restore %d", rs.me, rs.counter)
}
func (rs *rsmSrv) Kill() {
rs.mu.Lock()
defer rs.mu.Unlock()
//log.Printf("kill %d", rs.me)
//rs.rsm.Kill()
rs.rsm = nil
}
func (rs *rsmSrv) Raft() *raft.Raft {
rs.mu.Lock()
defer rs.mu.Unlock()
return rs.rsm.Raft()
}

src/kvraft1/rsm/test.go (Normal file, 113 lines)
@@ -0,0 +1,113 @@
package rsm
import (
//"log"
"testing"
"time"
"6.5840/kvsrv1/rpc"
"6.5840/labrpc"
"6.5840/raft"
"6.5840/tester1"
)
type Test struct {
*tester.Config
t *testing.T
g *tester.ServerGrp
maxraftstate int
srvs []*rsmSrv
leader int
}
const (
NSRV = 3
NSEC = 10
)
func makeTest(t *testing.T, maxraftstate int) *Test {
ts := &Test{
t: t,
maxraftstate: maxraftstate,
srvs: make([]*rsmSrv, NSRV),
}
ts.Config = tester.MakeConfig(t, NSRV, true, maxraftstate, ts.mksrv)
ts.g = ts.Group(tester.GRP0)
return ts
}
func (ts *Test) cleanup() {
ts.End()
ts.Config.Cleanup()
ts.CheckTimeout()
}
func (ts *Test) mksrv(ends []*labrpc.ClientEnd, grp tester.Tgid, srv int, persister *raft.Persister, maxraftstate int) tester.IKVServer {
s := makeRsmSrv(ts, srv, ends, persister, false)
ts.srvs[srv] = s
return s
}
func (ts *Test) one() *Rep {
// try all the servers; maybe one is the leader, but give up after NSEC seconds
t0 := time.Now()
for time.Since(t0).Seconds() < NSEC {
index := ts.leader
for range ts.srvs {
if ts.g.IsConnected(index) {
s := ts.srvs[index]
if s.rsm != nil {
err, rep := s.rsm.Submit(Inc{})
if err == rpc.OK {
ts.leader = index
//log.Printf("leader = %d", ts.leader)
return rep.(*Rep)
}
}
}
index = (index + 1) % len(ts.srvs)
}
time.Sleep(50 * time.Millisecond)
//log.Printf("try again: no leader")
}
ts.Fatalf("one: took too long")
return nil
}
func (ts *Test) checkCounter(v int, nsrv int) {
to := 10 * time.Millisecond
n := 0
for iters := 0; iters < 30; iters++ {
n = ts.countValue(v)
if n >= nsrv {
return
}
time.Sleep(to)
if to < time.Second {
to *= 2
}
}
ts.Fatalf("checkCounter: only %d srvs have %v instead of %d", n, v, nsrv)
}
func (ts *Test) countValue(v int) int {
i := 0
for _, s := range ts.srvs {
if s.counter == v {
i += 1
}
}
return i
}
func (ts *Test) disconnectLeader() int {
//log.Printf("disconnect %d", ts.leader)
ts.g.DisconnectAll(ts.leader)
return ts.leader
}
func (ts *Test) connect(i int) {
//log.Printf("connect %d", i)
ts.g.ConnectOne(i)
}

src/kvraft1/test.go (Normal file, 86 lines)
@@ -0,0 +1,86 @@
package kvraft
import (
"testing"
"6.5840/kvtest1"
"6.5840/tester1"
)
type Test struct {
t *testing.T
*kvtest.Test
part string
nclients int
nservers int
crash bool
partitions bool
maxraftstate int
randomkeys bool
}
const Gid = tester.GRP0
func MakeTest(t *testing.T, part string, nclients, nservers int, reliable bool, crash bool, partitions bool, maxraftstate int, randomkeys bool) *Test {
cfg := tester.MakeConfig(t, nservers, reliable, maxraftstate, StartKVServer)
ts := &Test{
t: t,
part: part,
nclients: nclients,
nservers: nservers,
crash: crash,
partitions: partitions,
maxraftstate: maxraftstate,
randomkeys: randomkeys,
}
ts.Test = kvtest.MakeTest(t, cfg, randomkeys, ts)
ts.Begin(ts.makeTitle())
return ts
}
func (ts *Test) MakeClerk() kvtest.IKVClerk {
clnt := ts.Config.MakeClient()
ck := MakeClerk(clnt, ts.Group(Gid).SrvNames())
return &kvtest.TestClerk{ck, clnt}
}
func (ts *Test) DeleteClerk(ck kvtest.IKVClerk) {
tck := ck.(*kvtest.TestClerk)
ts.DeleteClient(tck.Clnt)
}
func (ts *Test) MakeClerkTo(to []int) kvtest.IKVClerk {
ns := ts.Config.Group(Gid).SrvNamesTo(to)
clnt := ts.Config.MakeClientTo(ns)
ck := MakeClerk(clnt, ts.Group(Gid).SrvNames())
return &kvtest.TestClerk{ck, clnt}
}
func (ts *Test) cleanup() {
ts.Test.Cleanup()
}
func (ts *Test) makeTitle() string {
title := "Test: "
if ts.crash {
// peers re-start, and thus persistence must work.
title = title + "restarts, "
}
if ts.partitions {
// the network may partition
title = title + "partitions, "
}
if ts.maxraftstate != -1 {
title = title + "snapshots, "
}
if ts.randomkeys {
title = title + "random keys, "
}
if ts.nclients > 1 {
title = title + "many clients"
} else {
title = title + "one client"
}
title = title + " (" + ts.part + ")" // 4A or 4B
return title
}

src/kvsrv1/client.go (Normal file, 57 lines)
@@ -0,0 +1,57 @@
package kvsrv
import (
"6.5840/kvsrv1/rpc"
"6.5840/kvtest1"
"6.5840/tester1"
)
type Clerk struct {
clnt *tester.Clnt
server string
}
func MakeClerk(clnt *tester.Clnt, server string) kvtest.IKVClerk {
ck := &Clerk{clnt: clnt, server: server}
// You may add code here.
return ck
}
// Get fetches the current value and version for a key. It returns
// ErrNoKey if the key does not exist. It keeps trying forever in the
// face of all other errors.
//
// You can send an RPC with code like this:
// ok := ck.clnt.Call(ck.server, "KVServer.Get", &args, &reply)
//
// the types of args and reply (including whether they are pointers)
// must match the declared types of the RPC handler function's
// arguments, and reply must be passed as a pointer.
func (ck *Clerk) Get(key string) (string, rpc.Tversion, rpc.Err) {
// You will have to modify this function.
return "", 0, rpc.ErrNoKey
}
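// An illustrative sketch of the retry loop described above (not the assigned
// solution; the structure is up to you). It assumes the GetArgs/GetReply types
// from 6.5840/kvsrv1/rpc and uses a fresh reply struct on each attempt (see
// labgob's warning about decoding into non-default values):
//
//	for {
//		args := rpc.GetArgs{Key: key}
//		reply := rpc.GetReply{}
//		if ck.clnt.Call(ck.server, "KVServer.Get", &args, &reply) {
//			return reply.Value, reply.Version, reply.Err
//		}
//		// no reply (lost request or reply); try again
//	}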
// Put updates key with value only if the version in the
// request matches the version of the key at the server. If the
// version numbers don't match, the server should return
// ErrVersion. If Put receives an ErrVersion on its first RPC, Put
// should return ErrVersion, since the Put was definitely not
// performed at the server. If the server returns ErrVersion on a
// resend RPC, then Put must return ErrMaybe to the application, since
// its earlier RPC might have been processed by the server successfully
// but the response was lost, and the Clerk doesn't know if
// the Put was performed or not.
//
// You can send an RPC with code like this:
// ok := ck.clnt.Call(ck.server, "KVServer.Put", &args, &reply)
//
// the types of args and reply (including whether they are pointers)
// must match the declared types of the RPC handler function's
// arguments, and reply must be passed as a pointer.
func (ck *Clerk) Put(key, value string, version rpc.Tversion) rpc.Err {
// You will have to modify this function.
return rpc.ErrNoKey
}
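// An illustrative sketch of the first-try/resend rule described above (not the
// assigned solution). ErrVersion on the very first RPC is returned as
// ErrVersion; ErrVersion on a resend becomes ErrMaybe, because the earlier,
// lost attempt may already have been applied:
//
//	args := rpc.PutArgs{Key: key, Value: value, Version: version}
//	first := true
//	for {
//		reply := rpc.PutReply{}
//		if ck.clnt.Call(ck.server, "KVServer.Put", &args, &reply) {
//			if reply.Err == rpc.ErrVersion && !first {
//				return rpc.ErrMaybe
//			}
//			return reply.Err
//		}
//		first = false // no reply; resend the same request
//	}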

src/kvsrv1/kvsrv_test.go (Normal file, 162 lines)
@@ -0,0 +1,162 @@
package kvsrv
import (
// "log"
"runtime"
"testing"
"time"
"6.5840/kvsrv1/rpc"
"6.5840/kvtest1"
)
// Test Put with a single client and a reliable network
func TestReliablePut(t *testing.T) {
const Val = "6.5840"
const Ver = 0
ts := MakeTestKV(t, true)
defer ts.Cleanup()
ts.Begin("One client and reliable Put")
ck := ts.MakeClerk()
if err := ck.Put("k", Val, Ver); err != rpc.OK {
t.Fatalf("Put err %v", err)
}
if val, ver, err := ck.Get("k"); err != rpc.OK {
t.Fatalf("Get err %v; expected OK", err)
} else if val != Val {
t.Fatalf("Get value err %v; expected %v", val, Val)
} else if ver != Ver+1 {
t.Fatalf("Get wrong version %v; expected %v", ver, Ver+1)
}
if err := ck.Put("k", Val, 0); err != rpc.ErrVersion {
t.Fatalf("expected Put to fail with ErrVersion; got err=%v", err)
}
if err := ck.Put("y", Val, rpc.Tversion(1)); err != rpc.ErrNoKey {
t.Fatalf("expected Put to fail with ErrNoKey; got err=%v", err)
}
if _, _, err := ck.Get("y"); err != rpc.ErrNoKey {
t.Fatalf("expected Get to fail with ErrNoKey; got err=%v", err)
}
}
// Many clients putting on same key.
func TestPutConcurrentReliable(t *testing.T) {
const (
PORCUPINETIME = 10 * time.Second
NCLNT = 10
NSEC = 1
)
ts := MakeTestKV(t, true)
defer ts.Cleanup()
ts.Begin("Test: many clients racing to put values to the same key")
rs := ts.SpawnClientsAndWait(NCLNT, NSEC*time.Second, func(me int, ck kvtest.IKVClerk, done chan struct{}) kvtest.ClntRes {
return ts.OneClientPut(me, ck, []string{"k"}, done)
})
ck := ts.MakeClerk()
ts.CheckPutConcurrent(ck, "k", rs, &kvtest.ClntRes{})
ts.CheckPorcupineT(PORCUPINETIME)
}
// Check if memory used on server is reasonable
func TestMemPutManyClientsReliable(t *testing.T) {
const (
NCLIENT = 100_000
MEM = 1000
)
ts := MakeTestKV(t, true)
defer ts.Cleanup()
v := kvtest.RandValue(MEM)
cks := make([]kvtest.IKVClerk, NCLIENT)
for i := range cks {
cks[i] = ts.MakeClerk()
}
// force allocation of ends for server in each client
for i := 0; i < NCLIENT; i++ {
if err := cks[i].Put("k", "", 1); err != rpc.ErrNoKey {
t.Fatalf("Put failed %v", err)
}
}
ts.Begin("Test: memory use many put clients")
// allow threads started by labrpc to start
time.Sleep(1 * time.Second)
runtime.GC()
runtime.GC()
var st runtime.MemStats
runtime.ReadMemStats(&st)
m0 := st.HeapAlloc
for i := 0; i < NCLIENT; i++ {
if err := cks[i].Put("k", v, rpc.Tversion(i)); err != rpc.OK {
t.Fatalf("Put failed %v", err)
}
}
runtime.GC()
time.Sleep(1 * time.Second)
runtime.GC()
runtime.ReadMemStats(&st)
m1 := st.HeapAlloc
f := (float64(m1) - float64(m0)) / NCLIENT
if m1 > m0+(NCLIENT*200) {
t.Fatalf("error: server using too much memory %d %d (%.2f per client)\n", m0, m1, f)
}
}
// Test with one client and unreliable network. If Clerk.Put returns
// ErrMaybe, the Put must have happened, since the test uses only one
// client.
func TestUnreliableNet(t *testing.T) {
const NTRY = 100
ts := MakeTestKV(t, false)
defer ts.Cleanup()
ts.Begin("One client")
ck := ts.MakeClerk()
retried := false
for try := 0; try < NTRY; try++ {
for i := 0; true; i++ {
if err := ts.PutJson(ck, "k", i, rpc.Tversion(try), 0); err != rpc.ErrMaybe {
if i > 0 && err != rpc.ErrVersion {
t.Fatalf("Put shouldn't have happen more than once %v", err)
}
break
}
// Try put again; it should fail with ErrVersion
retried = true
}
v := 0
if ver := ts.GetJson(ck, "k", 0, &v); ver != rpc.Tversion(try+1) {
t.Fatalf("Wrong version %d expect %d", ver, try+1)
}
if v != 0 {
t.Fatalf("Wrong value %d expect %d", v, 0)
}
}
if !retried {
t.Fatalf("Clerk.Put never returned ErrMaybe")
}
ts.CheckPorcupine()
}

src/kvsrv1/lock/lock.go (Normal file, 30 lines)
@@ -0,0 +1,30 @@
package lock
import (
"6.5840/kvtest1"
)
type Lock struct {
// IKVClerk is a Go interface for k/v clerks: the interface hides
// the specific Clerk type of ck but promises that ck supports
// Put and Get. The tester passes the clerk in when calling
// MakeLock().
ck kvtest.IKVClerk
// You may add code here
}
// The tester calls MakeLock() and passes in a k/v clerk; your code can
// perform a Put or Get by calling lk.ck.Put() or lk.ck.Get().
func MakeLock(ck kvtest.IKVClerk, l string) *Lock {
lk := &Lock{ck: ck}
// You may add code here
return lk
}
func (lk *Lock) Acquire() {
// Your code here
}
func (lk *Lock) Release() {
// Your code here
}
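// One well-known approach (an illustrative sketch; the sentinel value and the
// per-client id are assumptions): store the holder's identity under the lock
// key l, with "" meaning free. Acquire loops: Get(l); if the value is "", try
// Put(l, myId, version). ErrVersion just means another client won the race, so
// retry; ErrMaybe requires another Get to see whether our Put actually took
// effect. Release writes "" back using the current version.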

@@ -0,0 +1,89 @@
package lock
import (
"fmt"
// "log"
"strconv"
"testing"
"time"
"6.5840/kvsrv1"
"6.5840/kvsrv1/rpc"
"6.5840/kvtest1"
)
const (
NACQUIRE = 10
NCLNT = 10
NSEC = 2
)
func oneClient(t *testing.T, me int, ck kvtest.IKVClerk, done chan struct{}) kvtest.ClntRes {
lk := MakeLock(ck, "l")
ck.Put("l0", "", 0)
for i := 1; true; i++ {
select {
case <-done:
return kvtest.ClntRes{i, 0}
default:
lk.Acquire()
// log.Printf("%d: acquired lock", me)
b := strconv.Itoa(me)
val, ver, err := ck.Get("l0")
if err == rpc.OK {
if val != "" {
t.Fatalf("%d: two clients acquired lock %v", me, val)
}
} else {
t.Fatalf("%d: get failed %v", me, err)
}
err = ck.Put("l0", string(b), ver)
if !(err == rpc.OK || err == rpc.ErrMaybe) {
t.Fatalf("%d: put failed %v", me, err)
}
time.Sleep(10 * time.Millisecond)
err = ck.Put("l0", "", ver+1)
if !(err == rpc.OK || err == rpc.ErrMaybe) {
t.Fatalf("%d: put failed %v", me, err)
}
// log.Printf("%d: release lock", me)
lk.Release()
}
}
return kvtest.ClntRes{}
}
// Run test clients
func runClients(t *testing.T, nclnt int, reliable bool) {
ts := kvsrv.MakeTestKV(t, reliable)
defer ts.Cleanup()
ts.Begin(fmt.Sprintf("Test: %d lock clients", nclnt))
ts.SpawnClientsAndWait(nclnt, NSEC*time.Second, func(me int, myck kvtest.IKVClerk, done chan struct{}) kvtest.ClntRes {
return oneClient(t, me, myck, done)
})
}
func TestOneClientReliable(t *testing.T) {
runClients(t, 1, true)
}
func TestManyClientsReliable(t *testing.T) {
runClients(t, NCLNT, true)
}
func TestOneClientUnreliable(t *testing.T) {
runClients(t, 1, false)
}
func TestManyClientsUnreliable(t *testing.T) {
runClients(t, NCLNT, false)
}

src/kvsrv1/rpc/rpc.go (Normal file, 39 lines)
@@ -0,0 +1,39 @@
package rpc
type Err string
const (
// Err's returned by server and Clerk
OK = "OK"
ErrNoKey = "ErrNoKey"
ErrVersion = "ErrVersion"
// Err returned by Clerk only
ErrMaybe = "ErrMaybe"
// For future kvraft lab
ErrWrongLeader = "ErrWrongLeader"
ErrWrongGroup = "ErrWrongGroup"
)
type Tversion uint64
type PutArgs struct {
Key string
Value string
Version Tversion
}
type PutReply struct {
Err Err
}
type GetArgs struct {
Key string
}
type GetReply struct {
Value string
Version Tversion
Err Err
}

src/kvsrv1/server.go (Normal file, 63 lines)
@@ -0,0 +1,63 @@
package kvsrv
import (
"log"
"sync"
"6.5840/kvsrv1/rpc"
"6.5840/labrpc"
"6.5840/raft"
"6.5840/tester1"
)
const Debug = false
func DPrintf(format string, a ...interface{}) (n int, err error) {
if Debug {
log.Printf(format, a...)
}
return
}
type KVServer struct {
mu sync.Mutex
// Your definitions here.
}
func MakeKVServer() *KVServer {
kv := &KVServer{}
// Your code here.
return kv
}
// Get returns the value and version for args.Key, if args.Key
// exists. Otherwise, Get returns ErrNoKey.
func (kv *KVServer) Get(args *rpc.GetArgs, reply *rpc.GetReply) {
// Your code here.
}
// Update the value for a key if args.Version matches the version of
// the key on the server. If the versions don't match, return ErrVersion.
// If the key doesn't exist, Put installs the value, but only if
// args.Version is 0; otherwise it returns ErrNoKey.
func (kv *KVServer) Put(args *rpc.PutArgs, reply *rpc.PutReply) {
// Your code here.
}
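// One possible internal representation (illustrative only, not required): a
// map from key to a value/version pair, protected by kv.mu. Get copies the
// pair into the reply; a successful Put overwrites the value and increments
// the version.
//
//	type entry struct {
//		Value   string
//		Version rpc.Tversion
//	}
//	// in KVServer: data map[string]entry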
// You can ignore for this lab
func (kv *KVServer) Kill() {
}
// You can ignore for this lab
func (kv *KVServer) Raft() *raft.Raft {
return nil
}
// You can ignore all arguments; they are for replicated KVservers in lab 4
func StartKVServer(ends []*labrpc.ClientEnd, gid tester.Tgid, srv int, persister *raft.Persister, maxraftstate int) tester.IKVServer {
kv := MakeKVServer()
return kv
}

src/kvsrv1/test.go (Normal file, 36 lines)
@@ -0,0 +1,36 @@
package kvsrv
import (
// "log"
"testing"
"6.5840/kvtest1"
"6.5840/tester1"
)
type TestKV struct {
*kvtest.Test
t *testing.T
reliable bool
}
func MakeTestKV(t *testing.T, reliable bool) *TestKV {
cfg := tester.MakeConfig(t, 1, reliable, -1, StartKVServer)
ts := &TestKV{
t: t,
reliable: reliable,
}
ts.Test = kvtest.MakeTest(t, cfg, false, ts)
return ts
}
func (ts *TestKV) MakeClerk() kvtest.IKVClerk {
clnt := ts.Config.MakeClient()
ck := MakeClerk(clnt, tester.ServerName(tester.GRP0, 0))
return &kvtest.TestClerk{ck, clnt}
}
func (ts *TestKV) DeleteClerk(ck kvtest.IKVClerk) {
tck := ck.(*kvtest.TestClerk)
ts.DeleteClient(tck.Clnt)
}

src/kvtest1/kvtest.go (Normal file, 360 lines)
@@ -0,0 +1,360 @@
package kvtest
import (
"encoding/json"
// "log"
"math/rand"
"strconv"
"testing"
"time"
"6.5840/kvsrv1/rpc"
"6.5840/tester1"
)
// The tester generously allows solutions to complete elections in one second
// (much more than the paper's range of timeouts).
const ElectionTimeout = 1 * time.Second
func RandValue(n int) string {
const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
b := make([]byte, n)
for i := range b {
b[i] = letterBytes[rand.Int63()%int64(len(letterBytes))]
}
return string(b)
}
type IKVClerk interface {
Get(string) (string, rpc.Tversion, rpc.Err)
Put(string, string, rpc.Tversion) rpc.Err
}
type TestClerk struct {
IKVClerk
Clnt *tester.Clnt
}
type IClerkMaker interface {
MakeClerk() IKVClerk
DeleteClerk(IKVClerk)
}
type Test struct {
*tester.Config
t *testing.T
oplog *OpLog
mck IClerkMaker
randomkeys bool
}
func MakeTest(t *testing.T, cfg *tester.Config, randomkeys bool, mck IClerkMaker) *Test {
ts := &Test{
Config: cfg,
t: t,
mck: mck,
oplog: &OpLog{},
randomkeys: randomkeys,
}
return ts
}
func (ts *Test) Cleanup() {
ts.Config.End()
ts.Config.Cleanup()
}
func (ts *Test) ConnectClnts(clnts []*tester.Clnt) {
for _, c := range clnts {
c.ConnectAll()
}
}
func (ts *Test) MakeClerk() IKVClerk {
return ts.mck.MakeClerk()
}
func (ts *Test) PutAtLeastOnce(ck IKVClerk, key, value string, ver rpc.Tversion, me int) rpc.Tversion {
for true {
if err := ts.Put(ck, key, value, ver, me); err == rpc.OK {
break
}
ver += 1
}
return ver
}
func (ts *Test) CheckGet(ck IKVClerk, key, value string, version rpc.Tversion) {
val, ver, err := ts.Get(ck, key, 0)
if err != rpc.OK {
ts.Fatalf("CheckGet err %v", err)
}
if val != value || ver != version {
ts.Fatalf("Get(%v): expected:\n%v %v\nreceived:\n%v %v", key, value, version, val, ver)
}
}
type ClntRes struct {
Nok int
Nmaybe int
}
func (ts *Test) CheckPutConcurrent(ck IKVClerk, key string, rs []ClntRes, res *ClntRes) {
e := EntryV{}
ver0 := ts.GetJson(ck, key, -1, &e)
for _, r := range rs {
res.Nok += r.Nok
res.Nmaybe += r.Nmaybe
}
if !ts.IsReliable() && ver0 > rpc.Tversion(res.Nok+res.Nmaybe) {
ts.Fatalf("Wrong number of puts: server %d clnts %v", ver0, res)
}
if ts.IsReliable() && ver0 != rpc.Tversion(res.Nok) {
ts.Fatalf("Wrong number of puts: server %d clnts %v", ver0, res)
}
}
// a client runs the function fn and then signals that it is done
func (ts *Test) runClient(me int, ca chan ClntRes, done chan struct{}, mkc IClerkMaker, fn Fclnt) {
ck := mkc.MakeClerk()
v := fn(me, ck, done)
ca <- v
mkc.DeleteClerk(ck)
}
type Fclnt func(int, IKVClerk, chan struct{}) ClntRes
// spawn nclnt clients
func (ts *Test) SpawnClientsAndWait(nclnt int, t time.Duration, fn Fclnt) []ClntRes {
ca := make([]chan ClntRes, nclnt)
done := make(chan struct{})
for cli := 0; cli < nclnt; cli++ {
ca[cli] = make(chan ClntRes)
go ts.runClient(cli, ca[cli], done, ts.mck, fn)
}
time.Sleep(t)
for i := 0; i < nclnt; i++ {
done <- struct{}{}
}
rs := make([]ClntRes, nclnt)
for cli := 0; cli < nclnt; cli++ {
rs[cli] = <-ca[cli]
}
return rs
}
func (ts *Test) GetJson(ck IKVClerk, key string, me int, v any) rpc.Tversion {
if val, ver, err := Get(ts.Config, ck, key, ts.oplog, me); err == rpc.OK {
if err := json.Unmarshal([]byte(val), v); err != nil {
ts.Fatalf("Unmarshal err %v", ver)
}
return ver
} else {
ts.Fatalf("%d: Get %q err %v", me, key, err)
return 0
}
}
func (ts *Test) PutJson(ck IKVClerk, key string, v any, ver rpc.Tversion, me int) rpc.Err {
b, err := json.Marshal(v)
if err != nil {
ts.Fatalf("%d: marshal %v", me, err)
}
return Put(ts.Config, ck, key, string(b), ver, ts.oplog, me)
}
func (ts *Test) PutAtLeastOnceJson(ck IKVClerk, key string, value any, ver rpc.Tversion, me int) rpc.Tversion {
for true {
if err := ts.PutJson(ck, key, value, 0, me); err != rpc.ErrMaybe {
break
}
ver += 1
}
return ver
}
type EntryV struct {
Id int
V rpc.Tversion
}
// Keep trying until one put succeeds, while other clients are
// trying to put to the same key
func (ts *Test) OnePut(me int, ck IKVClerk, key string, ver rpc.Tversion) (rpc.Tversion, bool) {
for true {
err := ts.PutJson(ck, key, EntryV{me, ver}, ver, me)
if !(err == rpc.OK || err == rpc.ErrVersion || err == rpc.ErrMaybe) {
ts.Fatalf("Wrong error %v", err)
}
e := EntryV{}
ver0 := ts.GetJson(ck, key, me, &e)
if err == rpc.OK && ver0 == ver+1 { // my put?
if e.Id != me && e.V != ver {
ts.Fatalf("Wrong value %v", e)
}
}
ver = ver0
if err == rpc.OK || err == rpc.ErrMaybe {
return ver, err == rpc.OK
}
}
return 0, false
}
// repartition the servers periodically
func (ts *Test) Partitioner(gid tester.Tgid, ch chan bool) {
defer func() { ch <- true }()
for true {
select {
case <-ch:
return
default:
a := make([]int, ts.Group(gid).N())
for i := 0; i < ts.Group(gid).N(); i++ {
a[i] = (rand.Int() % 2)
}
pa := make([][]int, 2)
for i := 0; i < 2; i++ {
pa[i] = make([]int, 0)
for j := 0; j < ts.Group(gid).N(); j++ {
if a[j] == i {
pa[i] = append(pa[i], j)
}
}
}
ts.Group(gid).Partition(pa[0], pa[1])
time.Sleep(ElectionTimeout + time.Duration(rand.Int63()%200)*time.Millisecond)
}
}
}
// One of perhaps many clients doing OnePuts until it receives a done signal.
func (ts *Test) OneClientPut(cli int, ck IKVClerk, ka []string, done chan struct{}) ClntRes {
res := ClntRes{}
verm := make(map[string]rpc.Tversion)
for _, k := range ka {
verm[k] = rpc.Tversion(0)
}
ok := false
for true {
select {
case <-done:
return res
default:
k := ka[0]
if ts.randomkeys {
k = ka[rand.Int()%len(ka)]
}
verm[k], ok = ts.OnePut(cli, ck, k, verm[k])
if ok {
res.Nok += 1
} else {
res.Nmaybe += 1
}
}
}
return res
}
func MakeKeys(n int) []string {
keys := make([]string, n)
for i := 0; i < n; i++ {
keys[i] = "k" + strconv.Itoa(i) // ensure multiple shards
}
return keys
}
func (ts *Test) SpreadPuts(ck IKVClerk, n int) ([]string, []string) {
ka := MakeKeys(n)
va := make([]string, n)
for i := 0; i < n; i++ {
va[i] = tester.Randstring(20)
ck.Put(ka[i], va[i], rpc.Tversion(0))
}
for i := 0; i < n; i++ {
ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
}
return ka, va
}
type entry struct {
Id int
N int
}
// At each iteration i, OneClientAppend attempts to append a tuple (me, i)
// to a key "k" shared with other clients. The client implements the
// append by first performing a Clerk.Get and then a Clerk.Put with
// the version number returned from the Get. If another client
// performs an append between the Get and the Put, the clerk may
// return ErrVersion and the client can retry. If the clerk returns
// ErrMaybe, the client's Put may have succeeded or not; in both
// cases, the client moves on to the next iteration. When running with
// many clients, the server's value for key "k" has the shape [(i, 1),
// (i, 2), (j, 1), (j, 3)...]: that is, each client has entries with
// increasing N, but some Ns may have been skipped.
func (ts *Test) OneClientAppend(me int, ck IKVClerk, done chan struct{}) ClntRes {
nmay := 0
nok := 0
for i := 0; true; i++ {
select {
case <-done:
return ClntRes{nok, nmay}
default:
// keep trying to put my i when err == ErrVersion
for true {
es := []entry{}
ver := ts.GetJson(ck, "k", me, &es)
es = append(es, entry{me, i})
if err := ts.PutJson(ck, "k", es, ver, me); err == rpc.OK {
nok += 1
break
} else if err == rpc.ErrMaybe {
// DPrintf("put %v err %v", ver, err)
nmay += 1
break
}
}
}
}
return ClntRes{}
}
type EntryN struct {
Id int
N int
}
// CheckAppends reads the latest value for key "k" and checks that it has the
// correct tuples.
func (ts *Test) CheckAppends(es []EntryN, nclnt int, rs []ClntRes, ver rpc.Tversion) {
expect := make(map[int]int)
skipped := make(map[int]int)
for i := 0; i < nclnt; i++ {
expect[i] = 0
skipped[i] = 0
}
for _, e := range es {
if expect[e.Id] > e.N { // old put?
ts.Fatalf("%d: wrong expecting %v but got %v", e.Id, expect[e.Id], e.N)
} else if expect[e.Id] == e.N {
expect[e.Id] += 1
} else { // missing entries because of failed put
s := (e.N - expect[e.Id])
expect[e.Id] = e.N + 1
skipped[e.Id] += s
}
}
if len(es)+1 != int(ver) {
ts.Fatalf("%d appends in val != puts on server %d", len(es), ver)
}
for c, n := range expect {
if skipped[c] > rs[c].Nmaybe {
ts.Fatalf("%d: skipped puts %d on server > %d maybe", c, skipped[c], rs[c].Nmaybe)
}
if n > rs[c].Nok+rs[c].Nmaybe {
ts.Fatalf("%d: %d puts on server > ok+maybe %d", c, n, rs[c].Nok+rs[c].Nmaybe)
}
}
}

src/kvtest1/porcupine.go (Normal file, 150 lines)
@@ -0,0 +1,150 @@
package kvtest
import (
"fmt"
"io/ioutil"
//"log"
"sync"
"testing"
"time"
"github.com/anishathalye/porcupine"
"6.5840/kvsrv1/rpc"
"6.5840/models1"
"6.5840/tester1"
)
const linearizabilityCheckTimeout = 1 * time.Second
type OpLog struct {
operations []porcupine.Operation
sync.Mutex
}
func (log *OpLog) Len() int {
log.Lock()
defer log.Unlock()
return len(log.operations)
}
func (log *OpLog) Append(op porcupine.Operation) {
log.Lock()
defer log.Unlock()
log.operations = append(log.operations, op)
}
func (log *OpLog) Read() []porcupine.Operation {
log.Lock()
defer log.Unlock()
ops := make([]porcupine.Operation, len(log.operations))
copy(ops, log.operations)
return ops
}
// to make sure timestamps use the monotonic clock, instead of computing
// absolute timestamps with `time.Now().UnixNano()` (which uses the wall
// clock), we measure time relative to `t0` using `time.Since(t0)`, which uses
// the monotonic clock
var t0 = time.Now()
func Get(cfg *tester.Config, ck IKVClerk, key string, log *OpLog, cli int) (string, rpc.Tversion, rpc.Err) {
start := int64(time.Since(t0))
val, ver, err := ck.Get(key)
end := int64(time.Since(t0))
cfg.Op()
if log != nil {
log.Append(porcupine.Operation{
Input: models.KvInput{Op: 0, Key: key},
Output: models.KvOutput{Value: val, Version: uint64(ver), Err: string(err)},
Call: start,
Return: end,
ClientId: cli,
})
}
return val, ver, err
}
func Put(cfg *tester.Config, ck IKVClerk, key string, value string, version rpc.Tversion, log *OpLog, cli int) rpc.Err {
start := int64(time.Since(t0))
err := ck.Put(key, value, version)
end := int64(time.Since(t0))
cfg.Op()
if log != nil {
log.Append(porcupine.Operation{
Input: models.KvInput{Op: 1, Key: key, Value: value, Version: uint64(version)},
Output: models.KvOutput{Err: string(err)},
Call: start,
Return: end,
ClientId: cli,
})
}
return err
}
// Checks that the log of Clerk.Put's and Clerk.Get's is linearizable (see
// linearizability-faq.txt)
func checkPorcupine(t *testing.T, opLog *OpLog, nsec time.Duration) {
//log.Printf("oplog len %v %v", ts.oplog.Len(), ts.oplog)
res, info := porcupine.CheckOperationsVerbose(models.KvModel, opLog.Read(), nsec)
if res == porcupine.Illegal {
file, err := ioutil.TempFile("", "porcupine-*.html")
if err != nil {
fmt.Printf("info: failed to create temp file for visualization")
} else {
err = porcupine.Visualize(models.KvModel, info, file)
if err != nil {
fmt.Printf("info: failed to write history visualization to %s\n", file.Name())
} else {
fmt.Printf("info: wrote history visualization to %s\n", file.Name())
}
}
t.Fatal("history is not linearizable")
} else if res == porcupine.Unknown {
fmt.Println("info: linearizability check timed out, assuming history is ok")
}
}
// Porcupine
func (ts *Test) Get(ck IKVClerk, key string, cli int) (string, rpc.Tversion, rpc.Err) {
start := int64(time.Since(t0))
val, ver, err := ck.Get(key)
end := int64(time.Since(t0))
ts.Op()
if ts.oplog != nil {
ts.oplog.Append(porcupine.Operation{
Input: models.KvInput{Op: 0, Key: key},
Output: models.KvOutput{Value: val, Version: uint64(ver), Err: string(err)},
Call: start,
Return: end,
ClientId: cli,
})
}
return val, ver, err
}
// Porcupine
func (ts *Test) Put(ck IKVClerk, key string, value string, version rpc.Tversion, cli int) rpc.Err {
start := int64(time.Since(t0))
err := ck.Put(key, value, version)
end := int64(time.Since(t0))
ts.Op()
if ts.oplog != nil {
ts.oplog.Append(porcupine.Operation{
Input: models.KvInput{Op: 1, Key: key, Value: value, Version: uint64(version)},
Output: models.KvOutput{Err: string(err)},
Call: start,
Return: end,
ClientId: cli,
})
}
return err
}
func (ts *Test) CheckPorcupine() {
checkPorcupine(ts.t, ts.oplog, linearizabilityCheckTimeout)
}
func (ts *Test) CheckPorcupineT(nsec time.Duration) {
checkPorcupine(ts.t, ts.oplog, nsec)
}

src/labgob/labgob.go (Normal file, 177 lines)
@@ -0,0 +1,177 @@
package labgob
//
// trying to send non-capitalized fields over RPC produces a range of
// misbehavior, including both mysterious incorrect computation and
// outright crashes. so this wrapper around Go's encoding/gob warns
// about non-capitalized field names.
//
import "encoding/gob"
import "io"
import "reflect"
import "fmt"
import "sync"
import "unicode"
import "unicode/utf8"
var mu sync.Mutex
var errorCount int // for TestCapital
var checked map[reflect.Type]bool
type LabEncoder struct {
gob *gob.Encoder
}
func NewEncoder(w io.Writer) *LabEncoder {
enc := &LabEncoder{}
enc.gob = gob.NewEncoder(w)
return enc
}
func (enc *LabEncoder) Encode(e interface{}) error {
checkValue(e)
return enc.gob.Encode(e)
}
func (enc *LabEncoder) EncodeValue(value reflect.Value) error {
checkValue(value.Interface())
return enc.gob.EncodeValue(value)
}
type LabDecoder struct {
gob *gob.Decoder
}
func NewDecoder(r io.Reader) *LabDecoder {
dec := &LabDecoder{}
dec.gob = gob.NewDecoder(r)
return dec
}
func (dec *LabDecoder) Decode(e interface{}) error {
checkValue(e)
checkDefault(e)
return dec.gob.Decode(e)
}
func Register(value interface{}) {
checkValue(value)
gob.Register(value)
}
func RegisterName(name string, value interface{}) {
checkValue(value)
gob.RegisterName(name, value)
}
func checkValue(value interface{}) {
checkType(reflect.TypeOf(value))
}
func checkType(t reflect.Type) {
k := t.Kind()
mu.Lock()
// only complain once, and avoid recursion.
if checked == nil {
checked = map[reflect.Type]bool{}
}
if checked[t] {
mu.Unlock()
return
}
checked[t] = true
mu.Unlock()
switch k {
case reflect.Struct:
for i := 0; i < t.NumField(); i++ {
f := t.Field(i)
rune, _ := utf8.DecodeRuneInString(f.Name)
if unicode.IsUpper(rune) == false {
// ta da
fmt.Printf("labgob error: lower-case field %v of %v in RPC or persist/snapshot will break your Raft\n",
f.Name, t.Name())
mu.Lock()
errorCount += 1
mu.Unlock()
}
checkType(f.Type)
}
return
case reflect.Slice, reflect.Array, reflect.Ptr:
checkType(t.Elem())
return
case reflect.Map:
checkType(t.Elem())
checkType(t.Key())
return
default:
return
}
}
//
// warn if the value contains non-default values,
// as it would if one sent an RPC but the reply
// struct was already modified. if the RPC reply
// contains default values, GOB won't overwrite
// the non-default value.
//
func checkDefault(value interface{}) {
if value == nil {
return
}
checkDefault1(reflect.ValueOf(value), 1, "")
}
func checkDefault1(value reflect.Value, depth int, name string) {
if depth > 3 {
return
}
t := value.Type()
k := t.Kind()
switch k {
case reflect.Struct:
for i := 0; i < t.NumField(); i++ {
vv := value.Field(i)
name1 := t.Field(i).Name
if name != "" {
name1 = name + "." + name1
}
checkDefault1(vv, depth+1, name1)
}
return
case reflect.Ptr:
if value.IsNil() {
return
}
checkDefault1(value.Elem(), depth+1, name)
return
case reflect.Bool,
reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64,
reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64,
reflect.Uintptr, reflect.Float32, reflect.Float64,
reflect.String:
if reflect.DeepEqual(reflect.Zero(t).Interface(), value.Interface()) == false {
mu.Lock()
if errorCount < 1 {
what := name
if what == "" {
what = t.Name()
}
// this warning typically arises if code re-uses the same RPC reply
// variable for multiple RPC calls, or if code restores persisted
// state into variables that already have non-default values.
fmt.Printf("labgob warning: Decoding into a non-default variable/field %v may not work\n",
what)
}
errorCount += 1
mu.Unlock()
}
return
}
}

src/labgob/test_test.go (Normal file, 166 lines)
@@ -0,0 +1,166 @@
package labgob
import "testing"
import "bytes"
type T1 struct {
T1int0 int
T1int1 int
T1string0 string
T1string1 string
}
type T2 struct {
T2slice []T1
T2map map[int]*T1
T2t3 interface{}
}
type T3 struct {
T3int999 int
}
// test that we didn't break GOB.
func TestGOB(t *testing.T) {
e0 := errorCount
w := new(bytes.Buffer)
Register(T3{})
{
x0 := 0
x1 := 1
t1 := T1{}
t1.T1int1 = 1
t1.T1string1 = "6.5840"
t2 := T2{}
t2.T2slice = []T1{T1{}, t1}
t2.T2map = map[int]*T1{}
t2.T2map[99] = &T1{1, 2, "x", "y"}
t2.T2t3 = T3{999}
e := NewEncoder(w)
e.Encode(x0)
e.Encode(x1)
e.Encode(t1)
e.Encode(t2)
}
data := w.Bytes()
{
var x0 int
var x1 int
var t1 T1
var t2 T2
r := bytes.NewBuffer(data)
d := NewDecoder(r)
if d.Decode(&x0) != nil ||
d.Decode(&x1) != nil ||
d.Decode(&t1) != nil ||
d.Decode(&t2) != nil {
t.Fatalf("Decode failed")
}
if x0 != 0 {
t.Fatalf("wrong x0 %v\n", x0)
}
if x1 != 1 {
t.Fatalf("wrong x1 %v\n", x1)
}
if t1.T1int0 != 0 {
t.Fatalf("wrong t1.T1int0 %v\n", t1.T1int0)
}
if t1.T1int1 != 1 {
t.Fatalf("wrong t1.T1int1 %v\n", t1.T1int1)
}
if t1.T1string0 != "" {
t.Fatalf("wrong t1.T1string0 %v\n", t1.T1string0)
}
if t1.T1string1 != "6.5840" {
t.Fatalf("wrong t1.T1string1 %v\n", t1.T1string1)
}
if len(t2.T2slice) != 2 {
t.Fatalf("wrong t2.T2slice len %v\n", len(t2.T2slice))
}
if t2.T2slice[1].T1int1 != 1 {
t.Fatalf("wrong slice value\n")
}
if len(t2.T2map) != 1 {
t.Fatalf("wrong t2.T2map len %v\n", len(t2.T2map))
}
if t2.T2map[99].T1string1 != "y" {
t.Fatalf("wrong map value\n")
}
t3 := (t2.T2t3).(T3)
if t3.T3int999 != 999 {
t.Fatalf("wrong t2.T2t3.T3int999\n")
}
}
if errorCount != e0 {
t.Fatalf("there were errors, but should not have been")
}
}
type T4 struct {
Yes int
no int
}
// make sure we check capitalization
// labgob prints one warning during this test.
func TestCapital(t *testing.T) {
e0 := errorCount
v := []map[*T4]int{}
w := new(bytes.Buffer)
e := NewEncoder(w)
e.Encode(v)
data := w.Bytes()
var v1 []map[T4]int
r := bytes.NewBuffer(data)
d := NewDecoder(r)
d.Decode(&v1)
if errorCount != e0+1 {
t.Fatalf("failed to warn about lower-case field")
}
}
// check that we warn when someone sends a default value over
// RPC but the target into which we're decoding holds a non-default
// value, which GOB seems not to overwrite as you'd expect.
//
// labgob does not print a warning.
func TestDefault(t *testing.T) {
e0 := errorCount
type DD struct {
X int
}
// send a default value...
dd1 := DD{}
w := new(bytes.Buffer)
e := NewEncoder(w)
e.Encode(dd1)
data := w.Bytes()
// and receive it into memory that already
// holds non-default values.
reply := DD{99}
r := bytes.NewBuffer(data)
d := NewDecoder(r)
d.Decode(&reply)
if errorCount != e0+1 {
t.Fatalf("failed to warn about decoding into non-default value")
}
}

src/labrpc/labrpc.go (Normal file, 536 lines)
@@ -0,0 +1,536 @@
package labrpc
//
// channel-based RPC, for 6.5840 labs.
//
// simulates a network that can lose requests, lose replies,
// delay messages, and entirely disconnect particular hosts.
//
// we will use the original labrpc.go to test your code for grading.
// so, while you can modify this code to help you debug, please
// test against the original before submitting.
//
// adapted from Go net/rpc/server.go.
//
// sends labgob-encoded values to ensure that RPCs
// don't include references to program objects.
//
// net := MakeNetwork() -- holds network, clients, servers.
// end := net.MakeEnd(endname) -- create a client end-point, to talk to one server.
// net.AddServer(servername, server) -- adds a named server to network.
// net.DeleteServer(servername) -- eliminate the named server.
// net.Connect(endname, servername) -- connect a client to a server.
// net.Enable(endname, enabled) -- enable/disable a client.
// net.Reliable(bool) -- false means drop/delay messages
//
// end.Call("Raft.AppendEntries", &args, &reply) -- send an RPC, wait for reply.
// the "Raft" is the name of the server struct to be called.
// the "AppendEntries" is the name of the method to be called.
// Call() returns true to indicate that the server executed the request
// and the reply is valid.
// Call() returns false if the network lost the request or reply
// or the server is down.
// It is OK to have multiple Call()s in progress at the same time on the
// same ClientEnd.
// Concurrent calls to Call() may be delivered to the server out of order,
// since the network may re-order messages.
// Call() is guaranteed to return (perhaps after a delay) *except* if the
// handler function on the server side does not return.
// the server RPC handler function must declare its args and reply arguments
// as pointers, so that their types exactly match the types of the arguments
// to Call().
//
// srv := MakeServer()
// srv.AddService(svc) -- a server can have multiple services, e.g. Raft and k/v
// pass srv to net.AddServer()
//
// svc := MakeService(receiverObject) -- obj's methods will handle RPCs
// much like Go's rpcs.Register()
// pass svc to srv.AddService()
//
import "6.5840/labgob"
import "bytes"
import "reflect"
import "sync"
import "log"
import "strings"
import "math/rand"
import "time"
import "sync/atomic"
const (
SHORTDELAY = 27 // ms
LONGDELAY = 7000 // ms
MAXDELAY = LONGDELAY + 100
)
type reqMsg struct {
endname interface{} // name of sending ClientEnd
svcMeth string // e.g. "Raft.AppendEntries"
argsType reflect.Type
args []byte
replyCh chan replyMsg
}
type replyMsg struct {
ok bool
reply []byte
}
type ClientEnd struct {
endname interface{} // this end-point's name
ch chan reqMsg // copy of Network.endCh
done chan struct{} // closed when Network is cleaned up
}
// send an RPC, wait for the reply.
// the return value indicates success; false means that
// no reply was received from the server.
func (e *ClientEnd) Call(svcMeth string, args interface{}, reply interface{}) bool {
req := reqMsg{}
req.endname = e.endname
req.svcMeth = svcMeth
req.argsType = reflect.TypeOf(args)
req.replyCh = make(chan replyMsg)
qb := new(bytes.Buffer)
qe := labgob.NewEncoder(qb)
if err := qe.Encode(args); err != nil {
panic(err)
}
req.args = qb.Bytes()
//
// send the request.
//
select {
case e.ch <- req:
// the request has been sent.
case <-e.done:
// entire Network has been destroyed.
return false
}
//
// wait for the reply.
//
rep := <-req.replyCh
if rep.ok {
rb := bytes.NewBuffer(rep.reply)
rd := labgob.NewDecoder(rb)
if err := rd.Decode(reply); err != nil {
log.Fatalf("ClientEnd.Call(): decode reply: %v\n", err)
}
return true
} else {
return false
}
}
type Network struct {
mu sync.Mutex
reliable bool
longDelays bool // pause a long time on sends to a disabled connection
longReordering bool // sometimes delay replies a long time
ends map[interface{}]*ClientEnd // ends, by name
enabled map[interface{}]bool // by end name
servers map[interface{}]*Server // servers, by name
connections map[interface{}]interface{} // endname -> servername
endCh chan reqMsg
done chan struct{} // closed when Network is cleaned up
count int32 // total RPC count, for statistics
bytes int64 // total bytes sent, for statistics
}
func MakeNetwork() *Network {
rn := &Network{}
rn.reliable = true
rn.ends = map[interface{}]*ClientEnd{}
rn.enabled = map[interface{}]bool{}
rn.servers = map[interface{}]*Server{}
rn.connections = map[interface{}](interface{}){}
rn.endCh = make(chan reqMsg)
rn.done = make(chan struct{})
// single goroutine to handle all ClientEnd.Call()s
go func() {
for {
select {
case xreq := <-rn.endCh:
atomic.AddInt32(&rn.count, 1)
atomic.AddInt64(&rn.bytes, int64(len(xreq.args)))
go rn.processReq(xreq)
case <-rn.done:
return
}
}
}()
return rn
}
func (rn *Network) Cleanup() {
close(rn.done)
}
func (rn *Network) Reliable(yes bool) {
rn.mu.Lock()
defer rn.mu.Unlock()
rn.reliable = yes
}
func (rn *Network) IsReliable() bool {
rn.mu.Lock()
defer rn.mu.Unlock()
return rn.reliable
}
func (rn *Network) LongReordering(yes bool) {
rn.mu.Lock()
defer rn.mu.Unlock()
rn.longReordering = yes
}
func (rn *Network) LongDelays(yes bool) {
rn.mu.Lock()
defer rn.mu.Unlock()
rn.longDelays = yes
}
func (rn *Network) readEndnameInfo(endname interface{}) (enabled bool,
servername interface{}, server *Server, reliable bool, longreordering bool,
) {
rn.mu.Lock()
defer rn.mu.Unlock()
enabled = rn.enabled[endname]
servername = rn.connections[endname]
if servername != nil {
server = rn.servers[servername]
}
reliable = rn.reliable
longreordering = rn.longReordering
return
}
func (rn *Network) isServerDead(endname interface{}, servername interface{}, server *Server) bool {
rn.mu.Lock()
defer rn.mu.Unlock()
if rn.enabled[endname] == false || rn.servers[servername] != server {
return true
}
return false
}
func (rn *Network) processReq(req reqMsg) {
enabled, servername, server, reliable, longreordering := rn.readEndnameInfo(req.endname)
if enabled && servername != nil && server != nil {
if reliable == false {
// short delay
ms := (rand.Int() % SHORTDELAY)
time.Sleep(time.Duration(ms) * time.Millisecond)
}
if reliable == false && (rand.Int()%1000) < 100 {
// drop the request, return as if timeout
req.replyCh <- replyMsg{false, nil}
return
}
// execute the request (call the RPC handler).
// in a separate thread so that we can periodically check
// if the server has been killed and the RPC should get a
// failure reply.
ech := make(chan replyMsg)
go func() {
r := server.dispatch(req)
ech <- r
}()
// wait for handler to return,
// but stop waiting if DeleteServer() has been called,
// and return an error.
var reply replyMsg
replyOK := false
serverDead := false
for replyOK == false && serverDead == false {
select {
case reply = <-ech:
replyOK = true
case <-time.After(100 * time.Millisecond):
serverDead = rn.isServerDead(req.endname, servername, server)
if serverDead {
go func() {
<-ech // drain channel to let the goroutine created earlier terminate
}()
}
}
}
// do not reply if DeleteServer() has been called, i.e.
// the server has been killed. this is needed to avoid
// situation in which a client gets a positive reply
// to an Append, but the server persisted the update
// into the old Persister. config.go is careful to call
// DeleteServer() before superseding the Persister.
serverDead = rn.isServerDead(req.endname, servername, server)
if replyOK == false || serverDead == true {
// server was killed while we were waiting; return error.
req.replyCh <- replyMsg{false, nil}
} else if reliable == false && (rand.Int()%1000) < 100 {
// drop the reply, return as if timeout
req.replyCh <- replyMsg{false, nil}
} else if longreordering == true && rand.Intn(900) < 600 {
// delay the response for a while
ms := 200 + rand.Intn(1+rand.Intn(2000))
// Russ points out that this timer arrangement will decrease
// the number of goroutines, so that the race
// detector is less likely to get upset.
time.AfterFunc(time.Duration(ms)*time.Millisecond, func() {
atomic.AddInt64(&rn.bytes, int64(len(reply.reply)))
req.replyCh <- reply
})
} else {
atomic.AddInt64(&rn.bytes, int64(len(reply.reply)))
req.replyCh <- reply
}
} else {
// simulate no reply and eventual timeout.
ms := 0
if rn.longDelays {
// let Raft tests check that leader doesn't send
// RPCs synchronously.
ms = (rand.Int() % LONGDELAY)
} else {
// many kv tests require the client to try each
// server in fairly rapid succession.
ms = (rand.Int() % 100)
}
time.AfterFunc(time.Duration(ms)*time.Millisecond, func() {
req.replyCh <- replyMsg{false, nil}
})
}
}
// create a client end-point.
// start the thread that listens and delivers.
func (rn *Network) MakeEnd(endname interface{}) *ClientEnd {
rn.mu.Lock()
defer rn.mu.Unlock()
if _, ok := rn.ends[endname]; ok {
log.Fatalf("MakeEnd: %v already exists\n", endname)
}
e := &ClientEnd{}
e.endname = endname
e.ch = rn.endCh
e.done = rn.done
rn.ends[endname] = e
rn.enabled[endname] = false
rn.connections[endname] = nil
return e
}
func (rn *Network) DeleteEnd(endname interface{}) {
rn.mu.Lock()
defer rn.mu.Unlock()
if _, ok := rn.ends[endname]; !ok {
log.Fatalf("MakeEnd: %v doesn't exists\n", endname)
}
delete(rn.ends, endname)
delete(rn.enabled, endname)
delete(rn.connections, endname)
}
func (rn *Network) AddServer(servername interface{}, rs *Server) {
rn.mu.Lock()
defer rn.mu.Unlock()
rn.servers[servername] = rs
}
func (rn *Network) DeleteServer(servername interface{}) {
rn.mu.Lock()
defer rn.mu.Unlock()
rn.servers[servername] = nil
}
// connect a ClientEnd to a server.
// a ClientEnd can only be connected once in its lifetime.
func (rn *Network) Connect(endname interface{}, servername interface{}) {
rn.mu.Lock()
defer rn.mu.Unlock()
rn.connections[endname] = servername
}
// enable/disable a ClientEnd.
func (rn *Network) Enable(endname interface{}, enabled bool) {
rn.mu.Lock()
defer rn.mu.Unlock()
rn.enabled[endname] = enabled
}
// get a server's count of incoming RPCs.
func (rn *Network) GetCount(servername interface{}) int {
rn.mu.Lock()
defer rn.mu.Unlock()
svr := rn.servers[servername]
return svr.GetCount()
}
func (rn *Network) GetTotalCount() int {
x := atomic.LoadInt32(&rn.count)
return int(x)
}
func (rn *Network) GetTotalBytes() int64 {
x := atomic.LoadInt64(&rn.bytes)
return x
}
// a server is a collection of services, all sharing
// the same rpc dispatcher. so that e.g. both a Raft
// and a k/v server can listen to the same rpc endpoint.
type Server struct {
mu sync.Mutex
services map[string]*Service
count int // incoming RPCs
}
func MakeServer() *Server {
rs := &Server{}
rs.services = map[string]*Service{}
return rs
}
func (rs *Server) AddService(svc *Service) {
rs.mu.Lock()
defer rs.mu.Unlock()
rs.services[svc.name] = svc
}
func (rs *Server) dispatch(req reqMsg) replyMsg {
rs.mu.Lock()
rs.count += 1
// split Raft.AppendEntries into service and method
dot := strings.LastIndex(req.svcMeth, ".")
serviceName := req.svcMeth[:dot]
methodName := req.svcMeth[dot+1:]
service, ok := rs.services[serviceName]
rs.mu.Unlock()
if ok {
return service.dispatch(methodName, req)
} else {
choices := []string{}
for k, _ := range rs.services {
choices = append(choices, k)
}
log.Fatalf("labrpc.Server.dispatch(): unknown service %v in %v.%v; expecting one of %v\n",
serviceName, serviceName, methodName, choices)
return replyMsg{false, nil}
}
}
func (rs *Server) GetCount() int {
rs.mu.Lock()
defer rs.mu.Unlock()
return rs.count
}
// an object with methods that can be called via RPC.
// a single server may have more than one Service.
type Service struct {
name string
rcvr reflect.Value
typ reflect.Type
methods map[string]reflect.Method
}
func MakeService(rcvr interface{}) *Service {
svc := &Service{}
svc.typ = reflect.TypeOf(rcvr)
svc.rcvr = reflect.ValueOf(rcvr)
svc.name = reflect.Indirect(svc.rcvr).Type().Name()
svc.methods = map[string]reflect.Method{}
for m := 0; m < svc.typ.NumMethod(); m++ {
method := svc.typ.Method(m)
mtype := method.Type
mname := method.Name
//fmt.Printf("%v pp %v ni %v 1k %v 2k %v no %v\n",
// mname, method.PkgPath, mtype.NumIn(), mtype.In(1).Kind(), mtype.In(2).Kind(), mtype.NumOut())
if method.PkgPath != "" || // capitalized?
mtype.NumIn() != 3 ||
//mtype.In(1).Kind() != reflect.Ptr ||
mtype.In(2).Kind() != reflect.Ptr ||
mtype.NumOut() != 0 {
// the method is not suitable for a handler
//fmt.Printf("bad method: %v\n", mname)
} else {
// the method looks like a handler
svc.methods[mname] = method
}
}
return svc
}
func (svc *Service) dispatch(methname string, req reqMsg) replyMsg {
if method, ok := svc.methods[methname]; ok {
// prepare space into which to read the argument.
// the Value's type will be a pointer to req.argsType.
args := reflect.New(req.argsType)
// decode the argument.
ab := bytes.NewBuffer(req.args)
ad := labgob.NewDecoder(ab)
ad.Decode(args.Interface())
// allocate space for the reply.
replyType := method.Type.In(2)
replyType = replyType.Elem()
replyv := reflect.New(replyType)
// call the method.
function := method.Func
function.Call([]reflect.Value{svc.rcvr, args.Elem(), replyv})
// encode the reply.
rb := new(bytes.Buffer)
re := labgob.NewEncoder(rb)
re.EncodeValue(replyv)
return replyMsg{true, rb.Bytes()}
} else {
choices := []string{}
for k, _ := range svc.methods {
choices = append(choices, k)
}
log.Fatalf("labrpc.Service.dispatch(): unknown method %v in %v; expecting one of %v\n",
methname, req.svcMeth, choices)
return replyMsg{false, nil}
}
}

597
src/labrpc/test_test.go Normal file
View File

@ -0,0 +1,597 @@
package labrpc
import "testing"
import "strconv"
import "sync"
import "runtime"
import "time"
import "fmt"
type JunkArgs struct {
X int
}
type JunkReply struct {
X string
}
type JunkServer struct {
mu sync.Mutex
log1 []string
log2 []int
}
func (js *JunkServer) Handler1(args string, reply *int) {
js.mu.Lock()
defer js.mu.Unlock()
js.log1 = append(js.log1, args)
*reply, _ = strconv.Atoi(args)
}
func (js *JunkServer) Handler2(args int, reply *string) {
js.mu.Lock()
defer js.mu.Unlock()
js.log2 = append(js.log2, args)
*reply = "handler2-" + strconv.Itoa(args)
}
func (js *JunkServer) Handler3(args int, reply *int) {
js.mu.Lock()
defer js.mu.Unlock()
time.Sleep(20 * time.Second)
*reply = -args
}
// args is a pointer
func (js *JunkServer) Handler4(args *JunkArgs, reply *JunkReply) {
reply.X = "pointer"
}
// args is not a pointer
func (js *JunkServer) Handler5(args JunkArgs, reply *JunkReply) {
reply.X = "no pointer"
}
func (js *JunkServer) Handler6(args string, reply *int) {
js.mu.Lock()
defer js.mu.Unlock()
*reply = len(args)
}
func (js *JunkServer) Handler7(args int, reply *string) {
js.mu.Lock()
defer js.mu.Unlock()
*reply = ""
for i := 0; i < args; i++ {
*reply = *reply + "y"
}
}
func TestBasic(t *testing.T) {
runtime.GOMAXPROCS(4)
rn := MakeNetwork()
defer rn.Cleanup()
e := rn.MakeEnd("end1-99")
js := &JunkServer{}
svc := MakeService(js)
rs := MakeServer()
rs.AddService(svc)
rn.AddServer("server99", rs)
rn.Connect("end1-99", "server99")
rn.Enable("end1-99", true)
{
reply := ""
e.Call("JunkServer.Handler2", 111, &reply)
if reply != "handler2-111" {
t.Fatalf("wrong reply from Handler2")
}
}
{
reply := 0
e.Call("JunkServer.Handler1", "9099", &reply)
if reply != 9099 {
t.Fatalf("wrong reply from Handler1")
}
}
}
func TestTypes(t *testing.T) {
runtime.GOMAXPROCS(4)
rn := MakeNetwork()
defer rn.Cleanup()
e := rn.MakeEnd("end1-99")
js := &JunkServer{}
svc := MakeService(js)
rs := MakeServer()
rs.AddService(svc)
rn.AddServer("server99", rs)
rn.Connect("end1-99", "server99")
rn.Enable("end1-99", true)
{
var args JunkArgs
var reply JunkReply
// args must match type (pointer or not) of handler.
e.Call("JunkServer.Handler4", &args, &reply)
if reply.X != "pointer" {
t.Fatalf("wrong reply from Handler4")
}
}
{
var args JunkArgs
var reply JunkReply
// args must match type (pointer or not) of handler.
e.Call("JunkServer.Handler5", args, &reply)
if reply.X != "no pointer" {
t.Fatalf("wrong reply from Handler5")
}
}
}
//
// does net.Enable(endname, false) really disconnect a client?
//
func TestDisconnect(t *testing.T) {
runtime.GOMAXPROCS(4)
rn := MakeNetwork()
defer rn.Cleanup()
e := rn.MakeEnd("end1-99")
js := &JunkServer{}
svc := MakeService(js)
rs := MakeServer()
rs.AddService(svc)
rn.AddServer("server99", rs)
rn.Connect("end1-99", "server99")
{
reply := ""
e.Call("JunkServer.Handler2", 111, &reply)
if reply != "" {
t.Fatalf("unexpected reply from Handler2")
}
}
rn.Enable("end1-99", true)
{
reply := 0
e.Call("JunkServer.Handler1", "9099", &reply)
if reply != 9099 {
t.Fatalf("wrong reply from Handler1")
}
}
}
//
// test net.GetCount()
//
func TestCounts(t *testing.T) {
runtime.GOMAXPROCS(4)
rn := MakeNetwork()
defer rn.Cleanup()
e := rn.MakeEnd("end1-99")
js := &JunkServer{}
svc := MakeService(js)
rs := MakeServer()
rs.AddService(svc)
rn.AddServer(99, rs)
rn.Connect("end1-99", 99)
rn.Enable("end1-99", true)
for i := 0; i < 17; i++ {
reply := ""
e.Call("JunkServer.Handler2", i, &reply)
wanted := "handler2-" + strconv.Itoa(i)
if reply != wanted {
t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted)
}
}
n := rn.GetCount(99)
if n != 17 {
t.Fatalf("wrong GetCount() %v, expected 17\n", n)
}
}
//
// test net.GetTotalBytes()
//
func TestBytes(t *testing.T) {
runtime.GOMAXPROCS(4)
rn := MakeNetwork()
defer rn.Cleanup()
e := rn.MakeEnd("end1-99")
js := &JunkServer{}
svc := MakeService(js)
rs := MakeServer()
rs.AddService(svc)
rn.AddServer(99, rs)
rn.Connect("end1-99", 99)
rn.Enable("end1-99", true)
for i := 0; i < 17; i++ {
args := "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
args = args + args
args = args + args
reply := 0
e.Call("JunkServer.Handler6", args, &reply)
wanted := len(args)
if reply != wanted {
t.Fatalf("wrong reply %v from Handler6, expecting %v", reply, wanted)
}
}
n := rn.GetTotalBytes()
if n < 4828 || n > 6000 {
t.Fatalf("wrong GetTotalBytes() %v, expected about 5000\n", n)
}
for i := 0; i < 17; i++ {
args := 107
reply := ""
e.Call("JunkServer.Handler7", args, &reply)
wanted := args
if len(reply) != wanted {
t.Fatalf("wrong reply len=%v from Handler6, expecting %v", len(reply), wanted)
}
}
nn := rn.GetTotalBytes() - n
if nn < 1800 || nn > 2500 {
t.Fatalf("wrong GetTotalBytes() %v, expected about 2000\n", nn)
}
}
//
// test RPCs from concurrent ClientEnds
//
func TestConcurrentMany(t *testing.T) {
runtime.GOMAXPROCS(4)
rn := MakeNetwork()
defer rn.Cleanup()
js := &JunkServer{}
svc := MakeService(js)
rs := MakeServer()
rs.AddService(svc)
rn.AddServer(1000, rs)
ch := make(chan int)
nclients := 20
nrpcs := 10
for ii := 0; ii < nclients; ii++ {
go func(i int) {
n := 0
defer func() { ch <- n }()
e := rn.MakeEnd(i)
rn.Connect(i, 1000)
rn.Enable(i, true)
for j := 0; j < nrpcs; j++ {
arg := i*100 + j
reply := ""
e.Call("JunkServer.Handler2", arg, &reply)
wanted := "handler2-" + strconv.Itoa(arg)
if reply != wanted {
t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted)
}
n += 1
}
}(ii)
}
total := 0
for ii := 0; ii < nclients; ii++ {
x := <-ch
total += x
}
if total != nclients*nrpcs {
t.Fatalf("wrong number of RPCs completed, got %v, expected %v", total, nclients*nrpcs)
}
n := rn.GetCount(1000)
if n != total {
t.Fatalf("wrong GetCount() %v, expected %v\n", n, total)
}
}
//
// test unreliable
//
func TestUnreliable(t *testing.T) {
runtime.GOMAXPROCS(4)
rn := MakeNetwork()
defer rn.Cleanup()
rn.Reliable(false)
js := &JunkServer{}
svc := MakeService(js)
rs := MakeServer()
rs.AddService(svc)
rn.AddServer(1000, rs)
ch := make(chan int)
nclients := 300
for ii := 0; ii < nclients; ii++ {
go func(i int) {
n := 0
defer func() { ch <- n }()
e := rn.MakeEnd(i)
rn.Connect(i, 1000)
rn.Enable(i, true)
arg := i * 100
reply := ""
ok := e.Call("JunkServer.Handler2", arg, &reply)
if ok {
wanted := "handler2-" + strconv.Itoa(arg)
if reply != wanted {
t.Fatalf("wrong reply %v from Handler1, expecting %v", reply, wanted)
}
n += 1
}
}(ii)
}
total := 0
for ii := 0; ii < nclients; ii++ {
x := <-ch
total += x
}
if total == nclients || total == 0 {
t.Fatalf("all RPCs succeeded despite unreliable")
}
}
//
// test concurrent RPCs from a single ClientEnd
//
func TestConcurrentOne(t *testing.T) {
runtime.GOMAXPROCS(4)
rn := MakeNetwork()
defer rn.Cleanup()
js := &JunkServer{}
svc := MakeService(js)
rs := MakeServer()
rs.AddService(svc)
rn.AddServer(1000, rs)
e := rn.MakeEnd("c")
rn.Connect("c", 1000)
rn.Enable("c", true)
ch := make(chan int)
nrpcs := 20
for ii := 0; ii < nrpcs; ii++ {
go func(i int) {
n := 0
defer func() { ch <- n }()
arg := 100 + i
reply := ""
e.Call("JunkServer.Handler2", arg, &reply)
wanted := "handler2-" + strconv.Itoa(arg)
if reply != wanted {
t.Fatalf("wrong reply %v from Handler2, expecting %v", reply, wanted)
}
n += 1
}(ii)
}
total := 0
for ii := 0; ii < nrpcs; ii++ {
x := <-ch
total += x
}
if total != nrpcs {
t.Fatalf("wrong number of RPCs completed, got %v, expected %v", total, nrpcs)
}
js.mu.Lock()
defer js.mu.Unlock()
if len(js.log2) != nrpcs {
t.Fatalf("wrong number of RPCs delivered")
}
n := rn.GetCount(1000)
if n != total {
t.Fatalf("wrong GetCount() %v, expected %v\n", n, total)
}
}
//
// regression: an RPC that's delayed during Enabled=false
// should not delay subsequent RPCs (e.g. after Enabled=true).
//
func TestRegression1(t *testing.T) {
runtime.GOMAXPROCS(4)
rn := MakeNetwork()
defer rn.Cleanup()
js := &JunkServer{}
svc := MakeService(js)
rs := MakeServer()
rs.AddService(svc)
rn.AddServer(1000, rs)
e := rn.MakeEnd("c")
rn.Connect("c", 1000)
// start some RPCs while the ClientEnd is disabled.
// they'll be delayed.
rn.Enable("c", false)
ch := make(chan bool)
nrpcs := 20
for ii := 0; ii < nrpcs; ii++ {
go func(i int) {
ok := false
defer func() { ch <- ok }()
arg := 100 + i
reply := ""
// this call ought to return false.
e.Call("JunkServer.Handler2", arg, &reply)
ok = true
}(ii)
}
time.Sleep(100 * time.Millisecond)
// now enable the ClientEnd and check that an RPC completes quickly.
t0 := time.Now()
rn.Enable("c", true)
{
arg := 99
reply := ""
e.Call("JunkServer.Handler2", arg, &reply)
wanted := "handler2-" + strconv.Itoa(arg)
if reply != wanted {
t.Fatalf("wrong reply %v from Handler2, expecting %v", reply, wanted)
}
}
dur := time.Since(t0).Seconds()
if dur > 0.03 {
t.Fatalf("RPC took too long (%v) after Enable", dur)
}
for ii := 0; ii < nrpcs; ii++ {
<-ch
}
js.mu.Lock()
defer js.mu.Unlock()
if len(js.log2) != 1 {
t.Fatalf("wrong number (%v) of RPCs delivered, expected 1", len(js.log2))
}
n := rn.GetCount(1000)
if n != 1 {
t.Fatalf("wrong GetCount() %v, expected %v\n", n, 1)
}
}
//
// if an RPC is stuck in a server, and the server
// is killed with DeleteServer(), does the RPC
// get un-stuck?
//
func TestKilled(t *testing.T) {
runtime.GOMAXPROCS(4)
rn := MakeNetwork()
defer rn.Cleanup()
e := rn.MakeEnd("end1-99")
js := &JunkServer{}
svc := MakeService(js)
rs := MakeServer()
rs.AddService(svc)
rn.AddServer("server99", rs)
rn.Connect("end1-99", "server99")
rn.Enable("end1-99", true)
doneCh := make(chan bool)
go func() {
reply := 0
ok := e.Call("JunkServer.Handler3", 99, &reply)
doneCh <- ok
}()
time.Sleep(1000 * time.Millisecond)
select {
case <-doneCh:
t.Fatalf("Handler3 should not have returned yet")
case <-time.After(100 * time.Millisecond):
}
rn.DeleteServer("server99")
select {
case x := <-doneCh:
if x != false {
t.Fatalf("Handler3 returned successfully despite DeleteServer()")
}
case <-time.After(100 * time.Millisecond):
t.Fatalf("Handler3 should return after DeleteServer()")
}
}
func TestBenchmark(t *testing.T) {
runtime.GOMAXPROCS(4)
rn := MakeNetwork()
defer rn.Cleanup()
e := rn.MakeEnd("end1-99")
js := &JunkServer{}
svc := MakeService(js)
rs := MakeServer()
rs.AddService(svc)
rn.AddServer("server99", rs)
rn.Connect("end1-99", "server99")
rn.Enable("end1-99", true)
t0 := time.Now()
n := 100000
for iters := 0; iters < n; iters++ {
reply := ""
e.Call("JunkServer.Handler2", 111, &reply)
if reply != "handler2-111" {
t.Fatalf("wrong reply from Handler2")
}
}
fmt.Printf("%v for %v\n", time.Since(t0), n)
// march 2016, rtm laptop, 22 microseconds per RPC
}

74
src/main/diskvd.go Normal file
View File

@ -0,0 +1,74 @@
package main
//
// start a diskvd server. it's a member of some replica
// group, which has other members, and it needs to know
// how to talk to the members of the shardmaster service.
// used by ../diskv/test_test.go
//
// arguments:
// -g groupid
// -m masterport1 -m masterport2 ...
// -s replicaport1 -s replicaport2 ...
// -i my-index-in-server-port-list
// -u unreliable
// -d directory
// -r restart
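//
// for example (socket paths and group id below are hypothetical):
//   ./diskvd -g 100 -m /tmp/me-m0 -m /tmp/me-m1 -s /tmp/me-s0 -s /tmp/me-s1 -i 0 -d /tmp/me-dir -r false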
import "time"
import "6.5840/diskv"
import "os"
import "fmt"
import "strconv"
import "runtime"
func usage() {
fmt.Printf("Usage: diskvd -g gid -m master... -s server... -i my-index -d dir\n")
os.Exit(1)
}
func main() {
var gid int64 = -1 // my replica group ID
masters := []string{} // ports of shardmasters
replicas := []string{} // ports of servers in my replica group
me := -1 // my index in replicas[]
unreliable := false
dir := "" // store persistent data here
restart := false
for i := 1; i+1 < len(os.Args); i += 2 {
a0 := os.Args[i]
a1 := os.Args[i+1]
if a0 == "-g" {
gid, _ = strconv.ParseInt(a1, 10, 64)
} else if a0 == "-m" {
masters = append(masters, a1)
} else if a0 == "-s" {
replicas = append(replicas, a1)
} else if a0 == "-i" {
me, _ = strconv.Atoi(a1)
} else if a0 == "-u" {
unreliable, _ = strconv.ParseBool(a1)
} else if a0 == "-d" {
dir = a1
} else if a0 == "-r" {
restart, _ = strconv.ParseBool(a1)
} else {
usage()
}
}
if gid < 0 || me < 0 || len(masters) < 1 || me >= len(replicas) || dir == "" {
usage()
}
runtime.GOMAXPROCS(4)
srv := diskv.StartServer(gid, masters, replicas, me, dir, restart)
srv.Setunreliable(unreliable)
// for safety, force quit after 10 minutes.
time.Sleep(10 * 60 * time.Second)
mep, _ := os.FindProcess(os.Getpid())
mep.Kill()
}

31
src/main/lockc.go Normal file
View File

@ -0,0 +1,31 @@
package main
//
// see comments in lockd.go
//
import "6.5840/lockservice"
import "os"
import "fmt"
func usage() {
fmt.Printf("Usage: lockc -l|-u primaryport backupport lockname\n")
os.Exit(1)
}
func main() {
if len(os.Args) == 5 {
ck := lockservice.MakeClerk(os.Args[2], os.Args[3])
var ok bool
if os.Args[1] == "-l" {
ok = ck.Lock(os.Args[4])
} else if os.Args[1] == "-u" {
ok = ck.Unlock(os.Args[4])
} else {
usage()
}
fmt.Printf("reply: %v\n", ok)
} else {
usage()
}
}

31
src/main/lockd.go Normal file
View File

@ -0,0 +1,31 @@
package main
// export GOPATH=~/6.5840
// go build lockd.go
// go build lockc.go
// ./lockd -p a b &
// ./lockd -b a b &
// ./lockc -l a b lx
// ./lockc -u a b lx
//
// on Athena, use /tmp/myname-a and /tmp/myname-b
// instead of a and b.
import "time"
import "6.5840/lockservice"
import "os"
import "fmt"
func main() {
if len(os.Args) == 4 && os.Args[1] == "-p" {
lockservice.StartServer(os.Args[2], os.Args[3], true)
} else if len(os.Args) == 4 && os.Args[1] == "-b" {
lockservice.StartServer(os.Args[2], os.Args[3], false)
} else {
fmt.Printf("Usage: lockd -p|-b primaryport backupport\n")
os.Exit(1)
}
for {
time.Sleep(100 * time.Second)
}
}

29
src/main/mrcoordinator.go Normal file
View File

@ -0,0 +1,29 @@
package main
//
// start the coordinator process, which is implemented
// in ../mr/coordinator.go
//
// go run mrcoordinator.go pg*.txt
//
// Please do not change this file.
//
import "6.5840/mr"
import "time"
import "os"
import "fmt"
func main() {
if len(os.Args) < 2 {
fmt.Fprintf(os.Stderr, "Usage: mrcoordinator inputfiles...\n")
os.Exit(1)
}
m := mr.MakeCoordinator(os.Args[1:], 10)
for m.Done() == false {
time.Sleep(time.Second)
}
time.Sleep(time.Second)
}

108
src/main/mrsequential.go Normal file
View File

@ -0,0 +1,108 @@
package main
//
// simple sequential MapReduce.
//
// go run mrsequential.go wc.so pg*.txt
//
import "fmt"
import "6.5840/mr"
import "plugin"
import "os"
import "log"
import "io/ioutil"
import "sort"
// for sorting by key.
type ByKey []mr.KeyValue
// for sorting by key.
func (a ByKey) Len() int { return len(a) }
func (a ByKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByKey) Less(i, j int) bool { return a[i].Key < a[j].Key }
func main() {
if len(os.Args) < 3 {
fmt.Fprintf(os.Stderr, "Usage: mrsequential xxx.so inputfiles...\n")
os.Exit(1)
}
mapf, reducef := loadPlugin(os.Args[1])
//
// read each input file,
// pass it to Map,
// accumulate the intermediate Map output.
//
intermediate := []mr.KeyValue{}
for _, filename := range os.Args[2:] {
file, err := os.Open(filename)
if err != nil {
log.Fatalf("cannot open %v", filename)
}
content, err := ioutil.ReadAll(file)
if err != nil {
log.Fatalf("cannot read %v", filename)
}
file.Close()
kva := mapf(filename, string(content))
intermediate = append(intermediate, kva...)
}
//
// a big difference from real MapReduce is that all the
// intermediate data is in one place, intermediate[],
// rather than being partitioned into NxM buckets.
//
sort.Sort(ByKey(intermediate))
oname := "mr-out-0"
ofile, _ := os.Create(oname)
//
// call Reduce on each distinct key in intermediate[],
// and print the result to mr-out-0.
//
i := 0
for i < len(intermediate) {
j := i + 1
for j < len(intermediate) && intermediate[j].Key == intermediate[i].Key {
j++
}
values := []string{}
for k := i; k < j; k++ {
values = append(values, intermediate[k].Value)
}
output := reducef(intermediate[i].Key, values)
// this is the correct format for each line of Reduce output.
fmt.Fprintf(ofile, "%v %v\n", intermediate[i].Key, output)
i = j
}
ofile.Close()
}
// load the application Map and Reduce functions
// from a plugin file, e.g. ../mrapps/wc.so
func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) {
p, err := plugin.Open(filename)
if err != nil {
log.Fatalf("cannot load plugin %v", filename)
}
xmapf, err := p.Lookup("Map")
if err != nil {
log.Fatalf("cannot find Map in %v", filename)
}
mapf := xmapf.(func(string, string) []mr.KeyValue)
xreducef, err := p.Lookup("Reduce")
if err != nil {
log.Fatalf("cannot find Reduce in %v", filename)
}
reducef := xreducef.(func(string, []string) string)
return mapf, reducef
}

49
src/main/mrworker.go Normal file
View File

@ -0,0 +1,49 @@
package main
//
// start a worker process, which is implemented
// in ../mr/worker.go. typically there will be
// multiple worker processes, talking to one coordinator.
//
// go run mrworker.go wc.so
//
// Please do not change this file.
//
import "6.5840/mr"
import "plugin"
import "os"
import "fmt"
import "log"
func main() {
if len(os.Args) != 2 {
fmt.Fprintf(os.Stderr, "Usage: mrworker xxx.so\n")
os.Exit(1)
}
mapf, reducef := loadPlugin(os.Args[1])
mr.Worker(mapf, reducef)
}
// load the application Map and Reduce functions
// from a plugin file, e.g. ../mrapps/wc.so
func loadPlugin(filename string) (func(string, string) []mr.KeyValue, func(string, []string) string) {
p, err := plugin.Open(filename)
if err != nil {
log.Fatalf("cannot load plugin %v", filename)
}
xmapf, err := p.Lookup("Map")
if err != nil {
log.Fatalf("cannot find Map in %v", filename)
}
mapf := xmapf.(func(string, string) []mr.KeyValue)
xreducef, err := p.Lookup("Reduce")
if err != nil {
log.Fatalf("cannot find Reduce in %v", filename)
}
reducef := xreducef.(func(string, []string) string)
return mapf, reducef
}

44
src/main/pbc.go Normal file
View File

@ -0,0 +1,44 @@
package main
//
// pbservice client application
//
// export GOPATH=~/6.5840
// go build viewd.go
// go build pbd.go
// go build pbc.go
// ./viewd /tmp/rtm-v &
// ./pbd /tmp/rtm-v /tmp/rtm-1 &
// ./pbd /tmp/rtm-v /tmp/rtm-2 &
// ./pbc /tmp/rtm-v key1 value1
// ./pbc /tmp/rtm-v key1
//
// change "rtm" to your user name.
// start the pbd programs in separate windows and kill
// and restart them to exercise fault tolerance.
//
import "6.5840/pbservice"
import "os"
import "fmt"
func usage() {
fmt.Printf("Usage: pbc viewport key\n")
fmt.Printf(" pbc viewport key value\n")
os.Exit(1)
}
func main() {
if len(os.Args) == 3 {
// get
ck := pbservice.MakeClerk(os.Args[1], "")
v := ck.Get(os.Args[2])
fmt.Printf("%v\n", v)
} else if len(os.Args) == 4 {
// put
ck := pbservice.MakeClerk(os.Args[1], "")
ck.Put(os.Args[2], os.Args[3])
} else {
usage()
}
}

23
src/main/pbd.go Normal file
View File

@ -0,0 +1,23 @@
package main
//
// see directions in pbc.go
//
import "time"
import "6.5840/pbservice"
import "os"
import "fmt"
func main() {
if len(os.Args) != 3 {
fmt.Printf("Usage: pbd viewport myport\n")
os.Exit(1)
}
pbservice.StartServer(os.Args[1], os.Args[2])
for {
time.Sleep(100 * time.Second)
}
}

3495
src/main/pg-being_ernest.txt Normal file

File diff suppressed because it is too large Load Diff

8904
src/main/pg-dorian_gray.txt Normal file

File diff suppressed because it is too large Load Diff

7653
src/main/pg-frankenstein.txt Normal file

File diff suppressed because it is too large Load Diff

9569
src/main/pg-grimm.txt Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

9206
src/main/pg-tom_sawyer.txt Normal file

File diff suppressed because it is too large Load Diff

23
src/main/test-mr-many.sh Normal file
View File

@ -0,0 +1,23 @@
#!/usr/bin/env bash
if [ $# -ne 1 ]; then
echo "Usage: $0 numTrials"
exit 1
fi
trap 'kill -INT -$pid; exit 1' INT
# Note: because the socketID is based on the current userID,
# ./test-mr.sh cannot be run in parallel
runs=$1
chmod +x test-mr.sh
for i in $(seq 1 $runs); do
timeout -k 2s 900s ./test-mr.sh &
pid=$!
if ! wait $pid; then
echo '***' FAILED TESTS IN TRIAL $i
exit 1
fi
done
echo '***' PASSED ALL $i TESTING TRIALS

338
src/main/test-mr.sh Normal file
View File

@ -0,0 +1,338 @@
#!/usr/bin/env bash
#
# map-reduce tests
#
# un-comment this to run the tests with the Go race detector.
# RACE=-race
if [[ "$OSTYPE" = "darwin"* ]]
then
if go version | grep 'go1.17.[012345]'
then
# -race with plug-ins on x86 MacOS 12 with
# go1.17 before 1.17.6 sometimes crash.
RACE=
echo '*** Turning off -race since it may not work on a Mac'
echo ' with ' `go version`
fi
fi
ISQUIET=$1
maybe_quiet() {
if [ "$ISQUIET" == "quiet" ]; then
"$@" > /dev/null 2>&1
else
"$@"
fi
}
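# for example, "./test-mr.sh quiet" hides the per-test output of the
# coordinator and workers; any other (or no) argument shows it.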
TIMEOUT=timeout
TIMEOUT2=""
if timeout 2s sleep 1 > /dev/null 2>&1
then
:
else
if gtimeout 2s sleep 1 > /dev/null 2>&1
then
TIMEOUT=gtimeout
else
# no timeout command
TIMEOUT=
echo '*** Cannot find timeout command; proceeding without timeouts.'
fi
fi
if [ "$TIMEOUT" != "" ]
then
TIMEOUT2=$TIMEOUT
TIMEOUT2+=" -k 2s 120s "
TIMEOUT+=" -k 2s 45s "
fi
# run the test in a fresh sub-directory.
rm -rf mr-tmp
mkdir mr-tmp || exit 1
cd mr-tmp || exit 1
rm -f mr-*
# make sure software is freshly built.
(cd ../../mrapps && go clean)
(cd .. && go clean)
(cd ../../mrapps && go build $RACE -buildmode=plugin wc.go) || exit 1
(cd ../../mrapps && go build $RACE -buildmode=plugin indexer.go) || exit 1
(cd ../../mrapps && go build $RACE -buildmode=plugin mtiming.go) || exit 1
(cd ../../mrapps && go build $RACE -buildmode=plugin rtiming.go) || exit 1
(cd ../../mrapps && go build $RACE -buildmode=plugin jobcount.go) || exit 1
(cd ../../mrapps && go build $RACE -buildmode=plugin early_exit.go) || exit 1
(cd ../../mrapps && go build $RACE -buildmode=plugin crash.go) || exit 1
(cd ../../mrapps && go build $RACE -buildmode=plugin nocrash.go) || exit 1
(cd .. && go build $RACE mrcoordinator.go) || exit 1
(cd .. && go build $RACE mrworker.go) || exit 1
(cd .. && go build $RACE mrsequential.go) || exit 1
failed_any=0
#########################################################
# first word-count
# generate the correct output
../mrsequential ../../mrapps/wc.so ../pg*txt || exit 1
sort mr-out-0 > mr-correct-wc.txt
rm -f mr-out*
echo '***' Starting wc test.
maybe_quiet $TIMEOUT ../mrcoordinator ../pg*txt &
pid=$!
# give the coordinator time to create the sockets.
sleep 1
# start multiple workers.
(maybe_quiet $TIMEOUT ../mrworker ../../mrapps/wc.so) &
(maybe_quiet $TIMEOUT ../mrworker ../../mrapps/wc.so) &
(maybe_quiet $TIMEOUT ../mrworker ../../mrapps/wc.so) &
# wait for the coordinator to exit.
wait $pid
# since workers are required to exit when a job is completely finished,
# and not before, that means the job has finished.
sort mr-out* | grep . > mr-wc-all
if cmp mr-wc-all mr-correct-wc.txt
then
echo '---' wc test: PASS
else
echo '---' wc output is not the same as mr-correct-wc.txt
echo '---' wc test: FAIL
failed_any=1
fi
# wait for remaining workers and coordinator to exit.
wait
#########################################################
# now indexer
rm -f mr-*
# generate the correct output
../mrsequential ../../mrapps/indexer.so ../pg*txt || exit 1
sort mr-out-0 > mr-correct-indexer.txt
rm -f mr-out*
echo '***' Starting indexer test.
maybe_quiet $TIMEOUT ../mrcoordinator ../pg*txt &
sleep 1
# start multiple workers
maybe_quiet $TIMEOUT ../mrworker ../../mrapps/indexer.so &
maybe_quiet $TIMEOUT ../mrworker ../../mrapps/indexer.so
sort mr-out* | grep . > mr-indexer-all
if cmp mr-indexer-all mr-correct-indexer.txt
then
echo '---' indexer test: PASS
else
echo '---' indexer output is not the same as mr-correct-indexer.txt
echo '---' indexer test: FAIL
failed_any=1
fi
wait
#########################################################
echo '***' Starting map parallelism test.
rm -f mr-*
maybe_quiet $TIMEOUT ../mrcoordinator ../pg*txt &
sleep 1
maybe_quiet $TIMEOUT ../mrworker ../../mrapps/mtiming.so &
maybe_quiet $TIMEOUT ../mrworker ../../mrapps/mtiming.so
NT=`cat mr-out* | grep '^times-' | wc -l | sed 's/ //g'`
if [ "$NT" != "2" ]
then
echo '---' saw "$NT" workers rather than 2
echo '---' map parallelism test: FAIL
failed_any=1
fi
if cat mr-out* | grep '^parallel.* 2' > /dev/null
then
echo '---' map parallelism test: PASS
else
echo '---' map workers did not run in parallel
echo '---' map parallelism test: FAIL
failed_any=1
fi
wait
#########################################################
echo '***' Starting reduce parallelism test.
rm -f mr-*
maybe_quiet $TIMEOUT ../mrcoordinator ../pg*txt &
sleep 1
maybe_quiet $TIMEOUT ../mrworker ../../mrapps/rtiming.so &
maybe_quiet $TIMEOUT ../mrworker ../../mrapps/rtiming.so
NT=`cat mr-out* | grep '^[a-z] 2' | wc -l | sed 's/ //g'`
if [ "$NT" -lt "2" ]
then
echo '---' too few parallel reduces.
echo '---' reduce parallelism test: FAIL
failed_any=1
else
echo '---' reduce parallelism test: PASS
fi
wait
#########################################################
echo '***' Starting job count test.
rm -f mr-*
maybe_quiet $TIMEOUT ../mrcoordinator ../pg*txt &
sleep 1
maybe_quiet $TIMEOUT ../mrworker ../../mrapps/jobcount.so &
maybe_quiet $TIMEOUT ../mrworker ../../mrapps/jobcount.so
maybe_quiet $TIMEOUT ../mrworker ../../mrapps/jobcount.so &
maybe_quiet $TIMEOUT ../mrworker ../../mrapps/jobcount.so
NT=`cat mr-out* | awk '{print $2}'`
if [ "$NT" -eq "8" ]
then
echo '---' job count test: PASS
else
echo '---' map jobs ran incorrect number of times "($NT != 8)"
echo '---' job count test: FAIL
failed_any=1
fi
wait
#########################################################
# test whether any worker or coordinator exits before the
# task has completed (i.e., all output files have been finalized)
rm -f mr-*
echo '***' Starting early exit test.
DF=anydone$$
rm -f $DF
(maybe_quiet $TIMEOUT ../mrcoordinator ../pg*txt; touch $DF) &
# give the coordinator time to create the sockets.
sleep 1
# start multiple workers.
(maybe_quiet $TIMEOUT ../mrworker ../../mrapps/early_exit.so; touch $DF) &
(maybe_quiet $TIMEOUT ../mrworker ../../mrapps/early_exit.so; touch $DF) &
(maybe_quiet $TIMEOUT ../mrworker ../../mrapps/early_exit.so; touch $DF) &
# wait for any of the coord or workers to exit.
# `jobs` ensures that any completed old processes from other tests
# are not waited upon.
jobs &> /dev/null
if [[ "$OSTYPE" = "darwin"* ]]
then
# bash on the Mac doesn't have wait -n
while [ ! -e $DF ]
do
sleep 0.2
done
else
# the -n causes wait to wait for just one child process,
# rather than waiting for all to finish.
wait -n
fi
rm -f $DF
# a process has exited. this means that the output should be finalized
# otherwise, either a worker or the coordinator exited early
sort mr-out* | grep . > mr-wc-all-initial
# wait for remaining workers and coordinator to exit.
wait
# compare initial and final outputs
sort mr-out* | grep . > mr-wc-all-final
if cmp mr-wc-all-final mr-wc-all-initial
then
echo '---' early exit test: PASS
else
echo '---' output changed after first worker exited
echo '---' early exit test: FAIL
failed_any=1
fi
rm -f mr-*
#########################################################
echo '***' Starting crash test.
# generate the correct output
../mrsequential ../../mrapps/nocrash.so ../pg*txt || exit 1
sort mr-out-0 > mr-correct-crash.txt
rm -f mr-out*
rm -f mr-done
((maybe_quiet $TIMEOUT2 ../mrcoordinator ../pg*txt); touch mr-done ) &
sleep 1
# start multiple workers
maybe_quiet $TIMEOUT2 ../mrworker ../../mrapps/crash.so &
# mimic rpc.go's coordinatorSock()
SOCKNAME=/var/tmp/5840-mr-`id -u`
( while [ -e $SOCKNAME -a ! -f mr-done ]
do
maybe_quiet $TIMEOUT2 ../mrworker ../../mrapps/crash.so
sleep 1
done ) &
( while [ -e $SOCKNAME -a ! -f mr-done ]
do
maybe_quiet $TIMEOUT2 ../mrworker ../../mrapps/crash.so
sleep 1
done ) &
while [ -e $SOCKNAME -a ! -f mr-done ]
do
maybe_quiet $TIMEOUT2 ../mrworker ../../mrapps/crash.so
sleep 1
done
wait
rm $SOCKNAME
sort mr-out* | grep . > mr-crash-all
if cmp mr-crash-all mr-correct-crash.txt
then
echo '---' crash test: PASS
else
echo '---' crash output is not the same as mr-correct-crash.txt
echo '---' crash test: FAIL
failed_any=1
fi
#########################################################
if [ $failed_any -eq 0 ]; then
echo '***' PASSED ALL TESTS
else
echo '***' FAILED SOME TESTS
exit 1
fi

23
src/main/viewd.go Normal file
View File

@ -0,0 +1,23 @@
package main
//
// see directions in pbc.go
//
import "time"
import "6.5840/viewservice"
import "os"
import "fmt"
func main() {
if len(os.Args) != 2 {
fmt.Printf("Usage: viewd port\n")
os.Exit(1)
}
viewservice.StartServer(os.Args[1])
for {
time.Sleep(100 * time.Second)
}
}

80
src/models1/kv.go Normal file
View File

@ -0,0 +1,80 @@
package models
import "github.com/anishathalye/porcupine"
import "fmt"
import "sort"
type KvInput struct {
Op uint8 // 0 => get, 1 => put
Key string
Value string
Version uint64
}
type KvOutput struct {
Value string
Version uint64
Err string
}
type KvState struct {
Value string
Version uint64
}
var KvModel = porcupine.Model{
Partition: func(history []porcupine.Operation) [][]porcupine.Operation {
m := make(map[string][]porcupine.Operation)
for _, v := range history {
key := v.Input.(KvInput).Key
m[key] = append(m[key], v)
}
keys := make([]string, 0, len(m))
for k := range m {
keys = append(keys, k)
}
sort.Strings(keys)
ret := make([][]porcupine.Operation, 0, len(keys))
for _, k := range keys {
ret = append(ret, m[k])
}
return ret
},
Init: func() interface{} {
// note: we are modeling a single key's value here;
// we're partitioning by key, so this is okay
return KvState{"", 0}
},
Step: func(state, input, output interface{}) (bool, interface{}) {
inp := input.(KvInput)
out := output.(KvOutput)
st := state.(KvState)
switch inp.Op {
case 0:
// get
return out.Value == st.Value, state
case 1:
// put
if st.Version == inp.Version {
return out.Err == "OK" || out.Err == "ErrMaybe", KvState{inp.Value, st.Version + 1}
} else {
return out.Err == "ErrVersion" || out.Err == "ErrMaybe", st
}
default:
return false, "<invalid>"
}
},
DescribeOperation: func(input, output interface{}) string {
inp := input.(KvInput)
out := output.(KvOutput)
switch inp.Op {
case 0:
return fmt.Sprintf("get('%s') -> ('%s', '%d', '%s')", inp.Key, out.Value, out.Version, out.Err)
case 1:
return fmt.Sprintf("put('%s', '%s', '%d') -> ('%s')", inp.Key, inp.Value, inp.Version, out.Err)
default:
return "<invalid>"
}
},
}
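// a worked example of the semantics encoded in Step above (the values
// are illustrative): starting from KvState{"", 0},
//   put(value="a", version=0) -> "OK"          new state KvState{"a", 1}
//   put(value="b", version=0) -> "ErrVersion"  state unchanged
//   get()                     -> "a"
// "ErrMaybe" is accepted in either case, since a dropped reply leaves
// the checker unsure whether the put was applied.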

70
src/mr/coordinator.go Normal file
View File

@ -0,0 +1,70 @@
package mr
import "log"
import "net"
import "os"
import "net/rpc"
import "net/http"
type Coordinator struct {
// Your definitions here.
}
// Your code here -- RPC handlers for the worker to call.
//
// an example RPC handler.
//
// the RPC argument and reply types are defined in rpc.go.
//
func (c *Coordinator) Example(args *ExampleArgs, reply *ExampleReply) error {
reply.Y = args.X + 1
return nil
}
//
// start a thread that listens for RPCs from worker.go
//
func (c *Coordinator) server() {
rpc.Register(c)
rpc.HandleHTTP()
//l, e := net.Listen("tcp", ":1234")
sockname := coordinatorSock()
os.Remove(sockname)
l, e := net.Listen("unix", sockname)
if e != nil {
log.Fatal("listen error:", e)
}
go http.Serve(l, nil)
}
//
// main/mrcoordinator.go calls Done() periodically to find out
// if the entire job has finished.
//
func (c *Coordinator) Done() bool {
ret := false
// Your code here.
return ret
}
//
// create a Coordinator.
// main/mrcoordinator.go calls this function.
// nReduce is the number of reduce tasks to use.
//
func MakeCoordinator(files []string, nReduce int) *Coordinator {
c := Coordinator{}
// Your code here.
c.server()
return &c
}

36
src/mr/rpc.go Normal file
View File

@ -0,0 +1,36 @@
package mr
//
// RPC definitions.
//
// remember to capitalize all names.
//
import "os"
import "strconv"
//
// example to show how to declare the arguments
// and reply for an RPC.
//
type ExampleArgs struct {
X int
}
type ExampleReply struct {
Y int
}
// Add your RPC definitions here.
// Cook up a unique-ish UNIX-domain socket name
// in /var/tmp, for the coordinator.
// Can't use the current directory since
// Athena AFS doesn't support UNIX-domain sockets.
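// for example, a (hypothetical) uid of 1000 yields "/var/tmp/5840-mr-1000".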
func coordinatorSock() string {
s := "/var/tmp/5840-mr-"
s += strconv.Itoa(os.Getuid())
return s
}

91
src/mr/worker.go Normal file
View File

@ -0,0 +1,91 @@
package mr
import "fmt"
import "log"
import "net/rpc"
import "hash/fnv"
//
// Map functions return a slice of KeyValue.
//
type KeyValue struct {
Key string
Value string
}
//
// use ihash(key) % NReduce to choose the reduce
// task number for each KeyValue emitted by Map.
//
func ihash(key string) int {
h := fnv.New32a()
h.Write([]byte(key))
return int(h.Sum32() & 0x7fffffff)
}
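// for example, a worker might partition intermediate pairs into nReduce
// buckets (the names below are illustrative, not part of the lab API):
//
//   bucket := ihash(kv.Key) % nReduce
//   buckets[bucket] = append(buckets[bucket], kv)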
//
// main/mrworker.go calls this function.
//
func Worker(mapf func(string, string) []KeyValue,
reducef func(string, []string) string) {
// Your worker implementation here.
// uncomment to send the Example RPC to the coordinator.
// CallExample()
}
//
// example function to show how to make an RPC call to the coordinator.
//
// the RPC argument and reply types are defined in rpc.go.
//
func CallExample() {
// declare an argument structure.
args := ExampleArgs{}
// fill in the argument(s).
args.X = 99
// declare a reply structure.
reply := ExampleReply{}
// send the RPC request, wait for the reply.
// the "Coordinator.Example" tells the
// receiving server that we'd like to call
// the Example() method of struct Coordinator.
ok := call("Coordinator.Example", &args, &reply)
if ok {
// reply.Y should be 100.
fmt.Printf("reply.Y %v\n", reply.Y)
} else {
fmt.Printf("call failed!\n")
}
}
//
// send an RPC request to the coordinator, wait for the response.
// usually returns true.
// returns false if something goes wrong.
//
func call(rpcname string, args interface{}, reply interface{}) bool {
// c, err := rpc.DialHTTP("tcp", "127.0.0.1"+":1234")
sockname := coordinatorSock()
c, err := rpc.DialHTTP("unix", sockname)
if err != nil {
log.Fatal("dialing:", err)
}
defer c.Close()
err = c.Call(rpcname, args, reply)
if err == nil {
return true
}
fmt.Println(err)
return false
}

55
src/mrapps/crash.go Normal file
View File

@ -0,0 +1,55 @@
package main
//
// a MapReduce pseudo-application that sometimes crashes,
// and sometimes takes a long time,
// to test MapReduce's ability to recover.
//
// go build -buildmode=plugin crash.go
//
import "6.5840/mr"
import crand "crypto/rand"
import "math/big"
import "strings"
import "os"
import "sort"
import "strconv"
import "time"
func maybeCrash() {
max := big.NewInt(1000)
rr, _ := crand.Int(crand.Reader, max)
if rr.Int64() < 330 {
// crash!
os.Exit(1)
} else if rr.Int64() < 660 {
// delay for a while.
maxms := big.NewInt(10 * 1000)
ms, _ := crand.Int(crand.Reader, maxms)
time.Sleep(time.Duration(ms.Int64()) * time.Millisecond)
}
}
func Map(filename string, contents string) []mr.KeyValue {
maybeCrash()
kva := []mr.KeyValue{}
kva = append(kva, mr.KeyValue{"a", filename})
kva = append(kva, mr.KeyValue{"b", strconv.Itoa(len(filename))})
kva = append(kva, mr.KeyValue{"c", strconv.Itoa(len(contents))})
kva = append(kva, mr.KeyValue{"d", "xyzzy"})
return kva
}
func Reduce(key string, values []string) string {
maybeCrash()
// sort values to ensure deterministic output.
vv := make([]string, len(values))
copy(vv, values)
sort.Strings(vv)
val := strings.Join(vv, " ")
return val
}

36
src/mrapps/early_exit.go Normal file
View File

@ -0,0 +1,36 @@
package main
//
// a MapReduce pseudo-application for the early-exit test: some reduce
// tasks sleep for a while, to catch workers that exit before the whole
// job has finished.
//
// go build -buildmode=plugin early_exit.go
//
import (
"strconv"
"strings"
"time"
"6.5840/mr"
)
// The map function is called once for each file of input.
// This map function just returns 1 for each file
func Map(filename string, contents string) []mr.KeyValue {
kva := []mr.KeyValue{}
kva = append(kva, mr.KeyValue{filename, "1"})
return kva
}
// The reduce function is called once for each key generated by the
// map tasks, with a list of all the values created for that key by
// any map task.
func Reduce(key string, values []string) string {
// some reduce tasks sleep for a while, to check whether a worker
// accidentally exits before the whole job has finished.
if strings.Contains(key, "sherlock") || strings.Contains(key, "tom") {
time.Sleep(time.Duration(3 * time.Second))
}
// return the number of occurrences of this file.
return strconv.Itoa(len(values))
}

39
src/mrapps/indexer.go Normal file
View File

@ -0,0 +1,39 @@
package main
//
// an indexing application "plugin" for MapReduce.
//
// go build -buildmode=plugin indexer.go
//
import "fmt"
import "6.5840/mr"
import "strings"
import "unicode"
import "sort"
// The mapping function is called once for each piece of the input.
// In this framework, the key is the name of the file that is being processed,
// and the value is the file's contents. The return value should be a slice of
// key/value pairs, each represented by a mr.KeyValue.
func Map(document string, value string) (res []mr.KeyValue) {
m := make(map[string]bool)
words := strings.FieldsFunc(value, func(x rune) bool { return !unicode.IsLetter(x) })
for _, w := range words {
m[w] = true
}
for w := range m {
kv := mr.KeyValue{w, document}
res = append(res, kv)
}
return
}
// The reduce function is called once for each key generated by Map, with a
// list of that key's string value (merged across all inputs). The return value
// should be a single output value for that key.
func Reduce(key string, values []string) string {
sort.Strings(values)
return fmt.Sprintf("%d %s", len(values), strings.Join(values, ","))
}

46
src/mrapps/jobcount.go Normal file
View File

@ -0,0 +1,46 @@
package main
//
// a MapReduce pseudo-application that counts the number of times map/reduce
// tasks are run, to test whether jobs are assigned multiple times even when
// there is no failure.
//
// go build -buildmode=plugin jobcount.go
//
import "6.5840/mr"
import "math/rand"
import "strings"
import "strconv"
import "time"
import "fmt"
import "os"
import "io/ioutil"
var count int
func Map(filename string, contents string) []mr.KeyValue {
me := os.Getpid()
f := fmt.Sprintf("mr-worker-jobcount-%d-%d", me, count)
count++
err := ioutil.WriteFile(f, []byte("x"), 0666)
if err != nil {
panic(err)
}
time.Sleep(time.Duration(2000+rand.Intn(3000)) * time.Millisecond)
return []mr.KeyValue{mr.KeyValue{"a", "x"}}
}
func Reduce(key string, values []string) string {
files, err := ioutil.ReadDir(".")
if err != nil {
panic(err)
}
invocations := 0
for _, f := range files {
if strings.HasPrefix(f.Name(), "mr-worker-jobcount") {
invocations++
}
}
return strconv.Itoa(invocations)
}

91
src/mrapps/mtiming.go Normal file
View File

@ -0,0 +1,91 @@
package main
//
// a MapReduce pseudo-application to test that workers
// execute map tasks in parallel.
//
// go build -buildmode=plugin mtiming.go
//
import "6.5840/mr"
import "strings"
import "fmt"
import "os"
import "syscall"
import "time"
import "sort"
import "io/ioutil"
func nparallel(phase string) int {
// create a file so that other workers will see that
// we're running at the same time as them.
pid := os.Getpid()
myfilename := fmt.Sprintf("mr-worker-%s-%d", phase, pid)
err := ioutil.WriteFile(myfilename, []byte("x"), 0666)
if err != nil {
panic(err)
}
// are any other workers running?
// find their PIDs by scanning directory for mr-worker-XXX files.
dd, err := os.Open(".")
if err != nil {
panic(err)
}
names, err := dd.Readdirnames(1000000)
if err != nil {
panic(err)
}
ret := 0
for _, name := range names {
var xpid int
pat := fmt.Sprintf("mr-worker-%s-%%d", phase)
n, err := fmt.Sscanf(name, pat, &xpid)
if n == 1 && err == nil {
err := syscall.Kill(xpid, 0)
if err == nil {
// if err == nil, xpid is alive.
ret += 1
}
}
}
dd.Close()
time.Sleep(1 * time.Second)
err = os.Remove(myfilename)
if err != nil {
panic(err)
}
return ret
}
func Map(filename string, contents string) []mr.KeyValue {
t0 := time.Now()
ts := float64(t0.Unix()) + (float64(t0.Nanosecond()) / 1000000000.0)
pid := os.Getpid()
n := nparallel("map")
kva := []mr.KeyValue{}
kva = append(kva, mr.KeyValue{
fmt.Sprintf("times-%v", pid),
fmt.Sprintf("%.1f", ts)})
kva = append(kva, mr.KeyValue{
fmt.Sprintf("parallel-%v", pid),
fmt.Sprintf("%d", n)})
return kva
}
func Reduce(key string, values []string) string {
//n := nparallel("reduce")
// sort values to ensure deterministic output.
vv := make([]string, len(values))
copy(vv, values)
sort.Strings(vv)
val := strings.Join(vv, " ")
return val
}

47
src/mrapps/nocrash.go Normal file
View File

@ -0,0 +1,47 @@
package main
//
// same as crash.go but doesn't actually crash.
//
// go build -buildmode=plugin nocrash.go
//
import "6.5840/mr"
import crand "crypto/rand"
import "math/big"
import "strings"
import "os"
import "sort"
import "strconv"
func maybeCrash() {
max := big.NewInt(1000)
rr, _ := crand.Int(crand.Reader, max)
if false && rr.Int64() < 500 {
// crash!
os.Exit(1)
}
}
func Map(filename string, contents string) []mr.KeyValue {
maybeCrash()
kva := []mr.KeyValue{}
kva = append(kva, mr.KeyValue{"a", filename})
kva = append(kva, mr.KeyValue{"b", strconv.Itoa(len(filename))})
kva = append(kva, mr.KeyValue{"c", strconv.Itoa(len(contents))})
kva = append(kva, mr.KeyValue{"d", "xyzzy"})
return kva
}
func Reduce(key string, values []string) string {
maybeCrash()
// sort values to ensure deterministic output.
vv := make([]string, len(values))
copy(vv, values)
sort.Strings(vv)
val := strings.Join(vv, " ")
return val
}

84
src/mrapps/rtiming.go Normal file
View File

@ -0,0 +1,84 @@
package main
//
// a MapReduce pseudo-application to test that workers
// execute reduce tasks in parallel.
//
// go build -buildmode=plugin rtiming.go
//
import "6.5840/mr"
import "fmt"
import "os"
import "syscall"
import "time"
import "io/ioutil"
func nparallel(phase string) int {
// create a file so that other workers will see that
// we're running at the same time as them.
pid := os.Getpid()
myfilename := fmt.Sprintf("mr-worker-%s-%d", phase, pid)
err := ioutil.WriteFile(myfilename, []byte("x"), 0666)
if err != nil {
panic(err)
}
// are any other workers running?
// find their PIDs by scanning directory for mr-worker-XXX files.
dd, err := os.Open(".")
if err != nil {
panic(err)
}
names, err := dd.Readdirnames(1000000)
if err != nil {
panic(err)
}
ret := 0
for _, name := range names {
var xpid int
pat := fmt.Sprintf("mr-worker-%s-%%d", phase)
n, err := fmt.Sscanf(name, pat, &xpid)
if n == 1 && err == nil {
err := syscall.Kill(xpid, 0)
if err == nil {
// if err == nil, xpid is alive.
ret += 1
}
}
}
dd.Close()
time.Sleep(1 * time.Second)
err = os.Remove(myfilename)
if err != nil {
panic(err)
}
return ret
}
func Map(filename string, contents string) []mr.KeyValue {
kva := []mr.KeyValue{}
kva = append(kva, mr.KeyValue{"a", "1"})
kva = append(kva, mr.KeyValue{"b", "1"})
kva = append(kva, mr.KeyValue{"c", "1"})
kva = append(kva, mr.KeyValue{"d", "1"})
kva = append(kva, mr.KeyValue{"e", "1"})
kva = append(kva, mr.KeyValue{"f", "1"})
kva = append(kva, mr.KeyValue{"g", "1"})
kva = append(kva, mr.KeyValue{"h", "1"})
kva = append(kva, mr.KeyValue{"i", "1"})
kva = append(kva, mr.KeyValue{"j", "1"})
return kva
}
func Reduce(key string, values []string) string {
n := nparallel("reduce")
val := fmt.Sprintf("%d", n)
return val
}

40
src/mrapps/wc.go Normal file
View File

@ -0,0 +1,40 @@
package main
//
// a word-count application "plugin" for MapReduce.
//
// go build -buildmode=plugin wc.go
//
import "6.5840/mr"
import "unicode"
import "strings"
import "strconv"
// The map function is called once for each file of input. The first
// argument is the name of the input file, and the second is the
// file's complete contents. You should ignore the input file name,
// and look only at the contents argument. The return value is a slice
// of key/value pairs.
func Map(filename string, contents string) []mr.KeyValue {
// function to detect word separators.
ff := func(r rune) bool { return !unicode.IsLetter(r) }
// split contents into an array of words.
words := strings.FieldsFunc(contents, ff)
kva := []mr.KeyValue{}
for _, w := range words {
kv := mr.KeyValue{w, "1"}
kva = append(kva, kv)
}
return kva
}
// The reduce function is called once for each key generated by the
// map tasks, with a list of all the values created for that key by
// any map task.
func Reduce(key string, values []string) string {
// return the number of occurrences of this word.
return strconv.Itoa(len(values))
}

648
src/raft/config.go Normal file
View File

@ -0,0 +1,648 @@
package raft
//
// support for Raft tester.
//
// we will use the original config.go to test your code for grading.
// so, while you can modify this code to help you debug, please
// test with the original before submitting.
//
import "6.5840/labgob"
import "6.5840/labrpc"
import "bytes"
import "log"
import "sync"
import "sync/atomic"
import "testing"
import "runtime"
import "math/rand"
import crand "crypto/rand"
import "math/big"
import "encoding/base64"
import "time"
import "fmt"
func randstring(n int) string {
b := make([]byte, 2*n)
crand.Read(b)
s := base64.URLEncoding.EncodeToString(b)
return s[0:n]
}
func makeSeed() int64 {
max := big.NewInt(int64(1) << 62)
bigx, _ := crand.Int(crand.Reader, max)
x := bigx.Int64()
return x
}
type config struct {
mu sync.Mutex
t *testing.T
finished int32
net *labrpc.Network
n int
rafts []*Raft
applyErr []string // from apply channel readers
connected []bool // whether each server is on the net
saved []*Persister
endnames [][]string // the port file names each sends to
logs []map[int]interface{} // copy of each server's committed entries
lastApplied []int
start time.Time // time at which make_config() was called
// begin()/end() statistics
t0 time.Time // time at which test_test.go called cfg.begin()
rpcs0 int // rpcTotal() at start of test
cmds0 int // number of agreements
bytes0 int64
maxIndex int
maxIndex0 int
}
var ncpu_once sync.Once
func make_config(t *testing.T, n int, unreliable bool, snapshot bool) *config {
ncpu_once.Do(func() {
if runtime.NumCPU() < 2 {
fmt.Printf("warning: only one CPU, which may conceal locking bugs\n")
}
rand.Seed(makeSeed())
})
runtime.GOMAXPROCS(4)
cfg := &config{}
cfg.t = t
cfg.net = labrpc.MakeNetwork()
cfg.n = n
cfg.applyErr = make([]string, cfg.n)
cfg.rafts = make([]*Raft, cfg.n)
cfg.connected = make([]bool, cfg.n)
cfg.saved = make([]*Persister, cfg.n)
cfg.endnames = make([][]string, cfg.n)
cfg.logs = make([]map[int]interface{}, cfg.n)
cfg.lastApplied = make([]int, cfg.n)
cfg.start = time.Now()
cfg.setunreliable(unreliable)
cfg.net.LongDelays(true)
applier := cfg.applier
if snapshot {
applier = cfg.applierSnap
}
// create a full set of Rafts.
for i := 0; i < cfg.n; i++ {
cfg.logs[i] = map[int]interface{}{}
cfg.start1(i, applier)
}
// connect everyone
for i := 0; i < cfg.n; i++ {
cfg.connect(i)
}
return cfg
}
// shut down a Raft server but save its persistent state.
func (cfg *config) crash1(i int) {
cfg.disconnect(i)
cfg.net.DeleteServer(i) // disable client connections to the server.
cfg.mu.Lock()
defer cfg.mu.Unlock()
// a fresh persister, in case old instance
// continues to update the Persister.
// but copy old persister's content so that we always
// pass Make() the last persisted state.
if cfg.saved[i] != nil {
cfg.saved[i] = cfg.saved[i].Copy()
}
rf := cfg.rafts[i]
if rf != nil {
cfg.mu.Unlock()
rf.Kill()
cfg.mu.Lock()
cfg.rafts[i] = nil
}
if cfg.saved[i] != nil {
raftlog := cfg.saved[i].ReadRaftState()
snapshot := cfg.saved[i].ReadSnapshot()
cfg.saved[i] = &Persister{}
cfg.saved[i].Save(raftlog, snapshot)
}
}
func (cfg *config) checkLogs(i int, m ApplyMsg) (string, bool) {
err_msg := ""
v := m.Command
for j := 0; j < len(cfg.logs); j++ {
if old, oldok := cfg.logs[j][m.CommandIndex]; oldok && old != v {
log.Printf("%v: log %v; server %v\n", i, cfg.logs[i], cfg.logs[j])
// some server has already committed a different value for this entry!
err_msg = fmt.Sprintf("commit index=%v server=%v %v != server=%v %v",
m.CommandIndex, i, m.Command, j, old)
}
}
_, prevok := cfg.logs[i][m.CommandIndex-1]
cfg.logs[i][m.CommandIndex] = v
if m.CommandIndex > cfg.maxIndex {
cfg.maxIndex = m.CommandIndex
}
return err_msg, prevok
}
// applier reads messages from applyCh and checks that they match the log
// contents.
func (cfg *config) applier(i int, applyCh chan ApplyMsg) {
for m := range applyCh {
if m.CommandValid == false {
// ignore other types of ApplyMsg
} else {
cfg.mu.Lock()
err_msg, prevok := cfg.checkLogs(i, m)
cfg.mu.Unlock()
if m.CommandIndex > 1 && prevok == false {
err_msg = fmt.Sprintf("server %v apply out of order %v", i, m.CommandIndex)
}
if err_msg != "" {
log.Fatalf("apply error: %v", err_msg)
cfg.applyErr[i] = err_msg
// keep reading after error so that Raft doesn't block
// holding locks...
}
}
}
}
// returns "" or error string
func (cfg *config) ingestSnap(i int, snapshot []byte, index int) string {
if snapshot == nil {
log.Fatalf("nil snapshot")
return "nil snapshot"
}
r := bytes.NewBuffer(snapshot)
d := labgob.NewDecoder(r)
var lastIncludedIndex int
var xlog []interface{}
if d.Decode(&lastIncludedIndex) != nil ||
d.Decode(&xlog) != nil {
log.Fatalf("snapshot decode error")
return "snapshot Decode() error"
}
if index != -1 && index != lastIncludedIndex {
err := fmt.Sprintf("server %v snapshot doesn't match m.SnapshotIndex", i)
return err
}
cfg.logs[i] = map[int]interface{}{}
for j := 0; j < len(xlog); j++ {
cfg.logs[i][j] = xlog[j]
}
cfg.lastApplied[i] = lastIncludedIndex
return ""
}
const SnapShotInterval = 10
// periodically snapshot raft state
func (cfg *config) applierSnap(i int, applyCh chan ApplyMsg) {
cfg.mu.Lock()
rf := cfg.rafts[i]
cfg.mu.Unlock()
if rf == nil {
return // ???
}
for m := range applyCh {
err_msg := ""
if m.SnapshotValid {
cfg.mu.Lock()
err_msg = cfg.ingestSnap(i, m.Snapshot, m.SnapshotIndex)
cfg.mu.Unlock()
} else if m.CommandValid {
if m.CommandIndex != cfg.lastApplied[i]+1 {
err_msg = fmt.Sprintf("server %v apply out of order, expected index %v, got %v", i, cfg.lastApplied[i]+1, m.CommandIndex)
}
if err_msg == "" {
cfg.mu.Lock()
var prevok bool
err_msg, prevok = cfg.checkLogs(i, m)
cfg.mu.Unlock()
if m.CommandIndex > 1 && prevok == false {
err_msg = fmt.Sprintf("server %v apply out of order %v", i, m.CommandIndex)
}
}
cfg.mu.Lock()
cfg.lastApplied[i] = m.CommandIndex
cfg.mu.Unlock()
if (m.CommandIndex+1)%SnapShotInterval == 0 {
w := new(bytes.Buffer)
e := labgob.NewEncoder(w)
e.Encode(m.CommandIndex)
var xlog []interface{}
for j := 0; j <= m.CommandIndex; j++ {
xlog = append(xlog, cfg.logs[i][j])
}
e.Encode(xlog)
rf.Snapshot(m.CommandIndex, w.Bytes())
}
} else {
// Ignore other types of ApplyMsg.
}
if err_msg != "" {
log.Fatalf("apply error: %v", err_msg)
cfg.applyErr[i] = err_msg
// keep reading after error so that Raft doesn't block
// holding locks...
}
}
}
// start or re-start a Raft.
// if one already exists, "kill" it first.
// allocate new outgoing port file names, and a new
// state persister, to isolate the previous instance of
// this server, since we cannot really kill it.
func (cfg *config) start1(i int, applier func(int, chan ApplyMsg)) {
cfg.crash1(i)
// a fresh set of outgoing ClientEnd names.
// so that old crashed instance's ClientEnds can't send.
cfg.endnames[i] = make([]string, cfg.n)
for j := 0; j < cfg.n; j++ {
cfg.endnames[i][j] = randstring(20)
}
// a fresh set of ClientEnds.
ends := make([]*labrpc.ClientEnd, cfg.n)
for j := 0; j < cfg.n; j++ {
ends[j] = cfg.net.MakeEnd(cfg.endnames[i][j])
cfg.net.Connect(cfg.endnames[i][j], j)
}
cfg.mu.Lock()
cfg.lastApplied[i] = 0
// a fresh persister, so old instance doesn't overwrite
// new instance's persisted state.
// but copy old persister's content so that we always
// pass Make() the last persisted state.
if cfg.saved[i] != nil {
cfg.saved[i] = cfg.saved[i].Copy()
snapshot := cfg.saved[i].ReadSnapshot()
if snapshot != nil && len(snapshot) > 0 {
// mimic KV server and process snapshot now.
// ideally Raft should send it up on applyCh...
err := cfg.ingestSnap(i, snapshot, -1)
if err != "" {
cfg.t.Fatal(err)
}
}
} else {
cfg.saved[i] = MakePersister()
}
cfg.mu.Unlock()
applyCh := make(chan ApplyMsg)
rf := Make(ends, i, cfg.saved[i], applyCh)
cfg.mu.Lock()
cfg.rafts[i] = rf
cfg.mu.Unlock()
go applier(i, applyCh)
svc := labrpc.MakeService(rf)
srv := labrpc.MakeServer()
srv.AddService(svc)
cfg.net.AddServer(i, srv)
}
func (cfg *config) checkTimeout() {
// enforce a two minute real-time limit on each test
if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second {
cfg.t.Fatal("test took longer than 120 seconds")
}
}
func (cfg *config) checkFinished() bool {
z := atomic.LoadInt32(&cfg.finished)
return z != 0
}
func (cfg *config) cleanup() {
atomic.StoreInt32(&cfg.finished, 1)
for i := 0; i < len(cfg.rafts); i++ {
if cfg.rafts[i] != nil {
cfg.rafts[i].Kill()
}
}
cfg.net.Cleanup()
cfg.checkTimeout()
}
// attach server i to the net.
func (cfg *config) connect(i int) {
// fmt.Printf("connect(%d)\n", i)
cfg.connected[i] = true
// outgoing ClientEnds
for j := 0; j < cfg.n; j++ {
if cfg.connected[j] {
endname := cfg.endnames[i][j]
cfg.net.Enable(endname, true)
}
}
// incoming ClientEnds
for j := 0; j < cfg.n; j++ {
if cfg.connected[j] {
endname := cfg.endnames[j][i]
cfg.net.Enable(endname, true)
}
}
}
// detach server i from the net.
func (cfg *config) disconnect(i int) {
// fmt.Printf("disconnect(%d)\n", i)
cfg.connected[i] = false
// outgoing ClientEnds
for j := 0; j < cfg.n; j++ {
if cfg.endnames[i] != nil {
endname := cfg.endnames[i][j]
cfg.net.Enable(endname, false)
}
}
// incoming ClientEnds
for j := 0; j < cfg.n; j++ {
if cfg.endnames[j] != nil {
endname := cfg.endnames[j][i]
cfg.net.Enable(endname, false)
}
}
}
func (cfg *config) rpcCount(server int) int {
return cfg.net.GetCount(server)
}
func (cfg *config) rpcTotal() int {
return cfg.net.GetTotalCount()
}
func (cfg *config) setunreliable(unrel bool) {
cfg.net.Reliable(!unrel)
}
func (cfg *config) bytesTotal() int64 {
return cfg.net.GetTotalBytes()
}
func (cfg *config) setlongreordering(longrel bool) {
cfg.net.LongReordering(longrel)
}
// check that one of the connected servers thinks
// it is the leader, and that no other connected
// server thinks otherwise.
//
// try a few times in case re-elections are needed.
func (cfg *config) checkOneLeader() int {
for iters := 0; iters < 10; iters++ {
ms := 450 + (rand.Int63() % 100)
time.Sleep(time.Duration(ms) * time.Millisecond)
leaders := make(map[int][]int)
for i := 0; i < cfg.n; i++ {
if cfg.connected[i] {
if term, leader := cfg.rafts[i].GetState(); leader {
leaders[term] = append(leaders[term], i)
}
}
}
lastTermWithLeader := -1
for term, leaders := range leaders {
if len(leaders) > 1 {
cfg.t.Fatalf("term %d has %d (>1) leaders", term, len(leaders))
}
if term > lastTermWithLeader {
lastTermWithLeader = term
}
}
if len(leaders) != 0 {
return leaders[lastTermWithLeader][0]
}
}
cfg.t.Fatalf("expected one leader, got none")
return -1
}
// check that everyone agrees on the term.
func (cfg *config) checkTerms() int {
term := -1
for i := 0; i < cfg.n; i++ {
if cfg.connected[i] {
xterm, _ := cfg.rafts[i].GetState()
if term == -1 {
term = xterm
} else if term != xterm {
cfg.t.Fatalf("servers disagree on term")
}
}
}
return term
}
// check that none of the connected servers
// thinks it is the leader.
func (cfg *config) checkNoLeader() {
for i := 0; i < cfg.n; i++ {
if cfg.connected[i] {
_, is_leader := cfg.rafts[i].GetState()
if is_leader {
cfg.t.Fatalf("expected no leader among connected servers, but %v claims to be leader", i)
}
}
}
}
// how many servers think a log entry is committed?
func (cfg *config) nCommitted(index int) (int, interface{}) {
count := 0
var cmd interface{} = nil
for i := 0; i < len(cfg.rafts); i++ {
if cfg.applyErr[i] != "" {
cfg.t.Fatal(cfg.applyErr[i])
}
cfg.mu.Lock()
cmd1, ok := cfg.logs[i][index]
cfg.mu.Unlock()
if ok {
if count > 0 && cmd != cmd1 {
cfg.t.Fatalf("committed values do not match: index %v, %v, %v",
index, cmd, cmd1)
}
count += 1
cmd = cmd1
}
}
return count, cmd
}
// wait for at least n servers to commit.
// but don't wait forever.
func (cfg *config) wait(index int, n int, startTerm int) interface{} {
to := 10 * time.Millisecond
for iters := 0; iters < 30; iters++ {
nd, _ := cfg.nCommitted(index)
if nd >= n {
break
}
time.Sleep(to)
if to < time.Second {
to *= 2
}
if startTerm > -1 {
for _, r := range cfg.rafts {
if t, _ := r.GetState(); t > startTerm {
// someone has moved on
// can no longer guarantee that we'll "win"
return -1
}
}
}
}
nd, cmd := cfg.nCommitted(index)
if nd < n {
cfg.t.Fatalf("only %d decided for index %d; wanted %d",
nd, index, n)
}
return cmd
}
// do a complete agreement.
// it might choose the wrong leader initially,
// and have to re-submit after giving up.
// entirely gives up after about 10 seconds.
// indirectly checks that the servers agree on the
// same value, since nCommitted() checks this,
// as do the threads that read from applyCh.
// returns index.
// if retry==true, may submit the command multiple
// times, in case a leader fails just after Start().
// if retry==false, calls Start() only once, in order
// to simplify the early Lab 3B tests.
func (cfg *config) one(cmd interface{}, expectedServers int, retry bool) int {
t0 := time.Now()
starts := 0
for time.Since(t0).Seconds() < 10 && cfg.checkFinished() == false {
// try all the servers, maybe one is the leader.
index := -1
for si := 0; si < cfg.n; si++ {
starts = (starts + 1) % cfg.n
var rf *Raft
cfg.mu.Lock()
if cfg.connected[starts] {
rf = cfg.rafts[starts]
}
cfg.mu.Unlock()
if rf != nil {
index1, _, ok := rf.Start(cmd)
if ok {
index = index1
break
}
}
}
if index != -1 {
// somebody claimed to be the leader and to have
// submitted our command; wait a while for agreement.
t1 := time.Now()
for time.Since(t1).Seconds() < 2 {
nd, cmd1 := cfg.nCommitted(index)
if nd > 0 && nd >= expectedServers {
// committed
if cmd1 == cmd {
// and it was the command we submitted.
return index
}
}
time.Sleep(20 * time.Millisecond)
}
if retry == false {
cfg.t.Fatalf("one(%v) failed to reach agreement", cmd)
}
} else {
time.Sleep(50 * time.Millisecond)
}
}
if cfg.checkFinished() == false {
cfg.t.Fatalf("one(%v) failed to reach agreement", cmd)
}
return -1
}
// start a Test.
// print the Test message.
// e.g. cfg.begin("Test (3B): RPC counts aren't too high")
func (cfg *config) begin(description string) {
fmt.Printf("%s ...\n", description)
cfg.t0 = time.Now()
cfg.rpcs0 = cfg.rpcTotal()
cfg.bytes0 = cfg.bytesTotal()
cfg.cmds0 = 0
cfg.maxIndex0 = cfg.maxIndex
}
// end a Test -- the fact that we got here means there
// was no failure.
// print the Passed message,
// and some performance numbers.
func (cfg *config) end() {
cfg.checkTimeout()
if cfg.t.Failed() == false {
cfg.mu.Lock()
t := time.Since(cfg.t0).Seconds() // real time
npeers := cfg.n // number of Raft peers
nrpc := cfg.rpcTotal() - cfg.rpcs0 // number of RPC sends
nbytes := cfg.bytesTotal() - cfg.bytes0 // number of bytes
ncmds := cfg.maxIndex - cfg.maxIndex0 // number of Raft agreements reported
cfg.mu.Unlock()
fmt.Printf(" ... Passed --")
fmt.Printf(" %4.1f %d %4d %7d %4d\n", t, npeers, nrpc, nbytes, ncmds)
}
}
// Maximum log size across all servers
func (cfg *config) LogSize() int {
logsize := 0
for i := 0; i < cfg.n; i++ {
n := cfg.saved[i].RaftStateSize()
if n > logsize {
logsize = n
}
}
return logsize
}
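// To show how the pieces above fit together, here is a hedged sketch of
// the way test_test.go typically drives this harness (the test name and
// command values are made up for illustration):
func exampleBasicAgreement(t *testing.T) {
	servers := 3
	cfg := make_config(t, servers, false, false)
	defer cfg.cleanup()
	cfg.begin("Test: example basic agreement")
	for i := 1; i <= 3; i++ {
		// submit a command and wait until all servers have committed it
		cfg.one(100+i, servers, false)
	}
	cfg.end()
}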

70
src/raft/persister.go Normal file
View File

@ -0,0 +1,70 @@
package raft
//
// support for Raft and kvraft to save persistent
// Raft state (log &c) and k/v server snapshots.
//
// we will use the original persister.go to test your code for grading.
// so, while you can modify this code to help you debug, please
// test with the original before submitting.
//
import "sync"
type Persister struct {
mu sync.Mutex
raftstate []byte
snapshot []byte
}
func MakePersister() *Persister {
return &Persister{}
}
func clone(orig []byte) []byte {
x := make([]byte, len(orig))
copy(x, orig)
return x
}
func (ps *Persister) Copy() *Persister {
ps.mu.Lock()
defer ps.mu.Unlock()
np := MakePersister()
np.raftstate = ps.raftstate
np.snapshot = ps.snapshot
return np
}
func (ps *Persister) ReadRaftState() []byte {
ps.mu.Lock()
defer ps.mu.Unlock()
return clone(ps.raftstate)
}
func (ps *Persister) RaftStateSize() int {
ps.mu.Lock()
defer ps.mu.Unlock()
return len(ps.raftstate)
}
// Save both Raft state and K/V snapshot as a single atomic action,
// to help avoid them getting out of sync.
func (ps *Persister) Save(raftstate []byte, snapshot []byte) {
ps.mu.Lock()
defer ps.mu.Unlock()
ps.raftstate = clone(raftstate)
ps.snapshot = clone(snapshot)
}
func (ps *Persister) ReadSnapshot() []byte {
ps.mu.Lock()
defer ps.mu.Unlock()
return clone(ps.snapshot)
}
func (ps *Persister) SnapshotSize() int {
ps.mu.Lock()
defer ps.mu.Unlock()
return len(ps.snapshot)
}
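// A small illustration of the intended calling pattern (a sketch; the
// real callers are Raft's persist()/readPersist() and the kvraft
// snapshot path):
func examplePersisterRoundTrip() {
	ps := MakePersister()
	ps.Save([]byte("raft-state-bytes"), []byte("snapshot-bytes"))
	state := ps.ReadRaftState() // returns a copy, so callers may modify it
	snap := ps.ReadSnapshot()
	_, _ = state, snap
}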

259
src/raft/raft.go Normal file
View File

@ -0,0 +1,259 @@
package raft
//
// this is an outline of the API that raft must expose to
// the service (or tester). see comments below for
// each of these functions for more details.
//
// rf = Make(...)
// create a new Raft server.
// rf.Start(command interface{}) (index, term, isleader)
// start agreement on a new log entry
// rf.GetState() (term, isLeader)
// ask a Raft for its current term, and whether it thinks it is leader
// ApplyMsg
// each time a new entry is committed to the log, each Raft peer
// should send an ApplyMsg to the service (or tester)
// in the same server.
//
import (
// "bytes"
"math/rand"
"sync"
"sync/atomic"
"time"
// "6.5840/labgob"
"6.5840/labrpc"
)
// as each Raft peer becomes aware that successive log entries are
// committed, the peer should send an ApplyMsg to the service (or
// tester) on the same server, via the applyCh passed to Make(). set
// CommandValid to true to indicate that the ApplyMsg contains a newly
// committed log entry.
//
// in part 3D you'll want to send other kinds of messages (e.g.,
// snapshots) on the applyCh, but set CommandValid to false for these
// other uses.
type ApplyMsg struct {
CommandValid bool
Command interface{}
CommandIndex int
// For 3D:
SnapshotValid bool
Snapshot []byte
SnapshotTerm int
SnapshotIndex int
}
// A Go object implementing a single Raft peer.
type Raft struct {
mu sync.Mutex // Lock to protect shared access to this peer's state
peers []*labrpc.ClientEnd // RPC end points of all peers
persister *Persister // Object to hold this peer's persisted state
me int // this peer's index into peers[]
dead int32 // set by Kill()
// Your data here (3A, 3B, 3C).
// Look at the paper's Figure 2 for a description of what
// state a Raft server must maintain.
}
// return currentTerm and whether this server
// believes it is the leader.
func (rf *Raft) GetState() (int, bool) {
var term int
var isleader bool
// Your code here (3A).
return term, isleader
}
// save Raft's persistent state to stable storage,
// where it can later be retrieved after a crash and restart.
// see paper's Figure 2 for a description of what should be persistent.
// before you've implemented snapshots, you should pass nil as the
// second argument to persister.Save().
// after you've implemented snapshots, pass the current snapshot
// (or nil if there's not yet a snapshot).
func (rf *Raft) persist() {
// Your code here (3C).
// Example:
// w := new(bytes.Buffer)
// e := labgob.NewEncoder(w)
// e.Encode(rf.xxx)
// e.Encode(rf.yyy)
// raftstate := w.Bytes()
// rf.persister.Save(raftstate, nil)
}
// restore previously persisted state.
func (rf *Raft) readPersist(data []byte) {
if data == nil || len(data) < 1 { // bootstrap without any state?
return
}
// Your code here (3C).
// Example:
// r := bytes.NewBuffer(data)
// d := labgob.NewDecoder(r)
// var xxx
// var yyy
// if d.Decode(&xxx) != nil ||
// d.Decode(&yyy) != nil {
// error...
// } else {
// rf.xxx = xxx
// rf.yyy = yyy
// }
}
// the service says it has created a snapshot that has
// all info up to and including index. this means the
// service no longer needs the log through (and including)
// that index. Raft should now trim its log as much as possible.
func (rf *Raft) Snapshot(index int, snapshot []byte) {
// Your code here (3D).
}
// example RequestVote RPC arguments structure.
// field names must start with capital letters!
type RequestVoteArgs struct {
// Your data here (3A, 3B).
}
// example RequestVote RPC reply structure.
// field names must start with capital letters!
type RequestVoteReply struct {
// Your data here (3A).
}
// example RequestVote RPC handler.
func (rf *Raft) RequestVote(args *RequestVoteArgs, reply *RequestVoteReply) {
// Your code here (3A, 3B).
}
// example code to send a RequestVote RPC to a server.
// server is the index of the target server in rf.peers[].
// expects RPC arguments in args.
// fills in *reply with RPC reply, so caller should
// pass &reply.
// the types of the args and reply passed to Call() must be
// the same as the types of the arguments declared in the
// handler function (including whether they are pointers).
//
// The labrpc package simulates a lossy network, in which servers
// may be unreachable, and in which requests and replies may be lost.
// Call() sends a request and waits for a reply. If a reply arrives
// within a timeout interval, Call() returns true; otherwise
// Call() returns false. Thus Call() may not return for a while.
// A false return can be caused by a dead server, a live server that
// can't be reached, a lost request, or a lost reply.
//
// Call() is guaranteed to return (perhaps after a delay) *except* if the
// handler function on the server side does not return. Thus there
// is no need to implement your own timeouts around Call().
//
// look at the comments in ../labrpc/labrpc.go for more details.
//
// if you're having trouble getting RPC to work, check that you've
// capitalized all field names in structs passed over RPC, and
// that the caller passes the address of the reply struct with &, not
// the struct itself.
func (rf *Raft) sendRequestVote(server int, args *RequestVoteArgs, reply *RequestVoteReply) bool {
ok := rf.peers[server].Call("Raft.RequestVote", args, reply)
return ok
}
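// Example (a sketch, not part of the required API): one common pattern is
// to send RequestVote to every other peer in parallel and tally the
// replies under a mutex. The reply fields you tally (e.g. a VoteGranted
// bool) are yours to define above, so this only counts successful RPCs.
func (rf *Raft) exampleBroadcastRequestVote() int {
	args := &RequestVoteArgs{} // fill in with your fields (term, candidate id, ...)
	var mu sync.Mutex
	var wg sync.WaitGroup
	replies := 0
	for peer := range rf.peers {
		if peer == rf.me {
			continue
		}
		wg.Add(1)
		go func(peer int) {
			defer wg.Done()
			reply := &RequestVoteReply{}
			if rf.sendRequestVote(peer, args, reply) {
				mu.Lock()
				replies++ // real code would also inspect the fields of reply
				mu.Unlock()
			}
		}(peer)
	}
	wg.Wait()
	return replies
}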
// the service using Raft (e.g. a k/v server) wants to start
// agreement on the next command to be appended to Raft's log. if this
// server isn't the leader, returns false. otherwise start the
// agreement and return immediately. there is no guarantee that this
// command will ever be committed to the Raft log, since the leader
// may fail or lose an election. even if the Raft instance has been killed,
// this function should return gracefully.
//
// the first return value is the index that the command will appear at
// if it's ever committed. the second return value is the current
// term. the third return value is true if this server believes it is
// the leader.
func (rf *Raft) Start(command interface{}) (int, int, bool) {
index := -1
term := -1
isLeader := true
// Your code here (3B).
return index, term, isLeader
}
// the tester doesn't halt goroutines created by Raft after each test,
// but it does call the Kill() method. your code can use killed() to
// check whether Kill() has been called. the use of atomic avoids the
// need for a lock.
//
// the issue is that long-running goroutines use memory and may chew
// up CPU time, perhaps causing later tests to fail and generating
// confusing debug output. any goroutine with a long-running loop
// should call killed() to check whether it should stop.
func (rf *Raft) Kill() {
atomic.StoreInt32(&rf.dead, 1)
// Your code here, if desired.
}
func (rf *Raft) killed() bool {
z := atomic.LoadInt32(&rf.dead)
return z == 1
}
func (rf *Raft) ticker() {
for rf.killed() == false {
// Your code here (3A)
// Check if a leader election should be started.
// pause for a random amount of time between 50 and 350
// milliseconds.
ms := 50 + (rand.Int63() % 300)
time.Sleep(time.Duration(ms) * time.Millisecond)
}
}
// the service or tester wants to create a Raft server. the ports
// of all the Raft servers (including this one) are in peers[]. this
// server's port is peers[me]. all the servers' peers[] arrays
// have the same order. persister is a place for this server to
// save its persistent state, and also initially holds the most
// recent saved state, if any. applyCh is a channel on which the
// tester or service expects Raft to send ApplyMsg messages.
// Make() must return quickly, so it should start goroutines
// for any long-running work.
func Make(peers []*labrpc.ClientEnd, me int,
persister *Persister, applyCh chan ApplyMsg) *Raft {
rf := &Raft{}
rf.peers = peers
rf.persister = persister
rf.me = me
// Your initialization code here (3A, 3B, 3C).
// initialize from state persisted before a crash
rf.readPersist(persister.ReadRaftState())
// start ticker goroutine to start elections
go rf.ticker()
return rf
}
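// For orientation, a hedged sketch of how a service (or the tester)
// typically consumes committed entries from applyCh; the tester's
// applier() in config.go is the authoritative version:
func exampleServiceLoop(applyCh chan ApplyMsg) {
	for m := range applyCh {
		if m.CommandValid {
			// apply m.Command to the service's state machine, and
			// remember m.CommandIndex (useful for snapshots in 3D).
		} else if m.SnapshotValid {
			// install m.Snapshot, which replaces state up to and
			// including m.SnapshotIndex.
		}
	}
}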

1270
src/raft/test_test.go Normal file

File diff suppressed because it is too large Load Diff

12
src/raft/util.go Normal file
View File

@ -0,0 +1,12 @@
package raft
import "log"
// Debugging
const Debug = false
func DPrintf(format string, a ...interface{}) {
if Debug {
log.Printf(format, a...)
}
}

51
src/shardkv1/client.go Normal file
View File

@ -0,0 +1,51 @@
package shardkv
//
// client code to talk to a sharded key/value service.
//
// the client uses the shardctrler's clerk to query for the current
// configuration and find the assignment of shards (keys) to groups,
// and then talks to the group that holds the key's shard.
//
import (
"6.5840/kvsrv1/rpc"
"6.5840/kvtest1"
"6.5840/shardkv1/shardctrler"
"6.5840/tester1"
)
type Clerk struct {
clnt *tester.Clnt
qck *shardctrler.QueryClerk
// You will have to modify this struct.
}
// The tester calls MakeClerk and passes in a clerk for the
// shardctrler with only the Query method.
func MakeClerk(clnt *tester.Clnt, qck *shardctrler.QueryClerk) kvtest.IKVClerk {
ck := &Clerk{
clnt: clnt,
qck: qck,
}
// You'll have to add code here.
return ck
}
// Get a key from a shardgrp. You can use shardcfg.Key2Shard(key) to
// find the shard responsible for the key and ck.qck.Query() to read
// the current configuration and lookup the servers in the group
// responsible for key. You can make a clerk for that group by
// calling shardgrp.MakeClerk(ck.clnt, servers).
func (ck *Clerk) Get(key string) (string, rpc.Tversion, rpc.Err) {
// You will have to modify this function.
return "", 0, ""
}
// Put a key to a shard group.
func (ck *Clerk) Put(key string, value string, version rpc.Tversion) rpc.Err {
// You will have to modify this function.
return ""
}

View File

@ -0,0 +1,275 @@
package shardcfg
import (
"encoding/json"
"hash/fnv"
"log"
"runtime/debug"
"slices"
"testing"
"6.5840/tester1"
)
type Tshid int
type Tnum int
const (
NShards = 12 // The number of shards.
NumFirst = Tnum(1)
)
const (
Gid1 = tester.Tgid(1)
)
// which shard is a key in?
// please use this function,
// and please do not change it.
func Key2Shard(key string) Tshid {
h := fnv.New32a()
h.Write([]byte(key))
shard := Tshid(Tshid(h.Sum32()) % NShards)
return shard
}
// A configuration -- an assignment of shards to groups.
// Please don't change this.
type ShardConfig struct {
Num Tnum // config number
Shards [NShards]tester.Tgid // shard -> gid
Groups map[tester.Tgid][]string // gid -> servers[]
}
func MakeShardConfig() *ShardConfig {
c := &ShardConfig{
Groups: make(map[tester.Tgid][]string),
}
return c
}
func (cfg *ShardConfig) String() string {
b, err := json.Marshal(cfg)
if err != nil {
log.Fatalf("Unmarshall err %v", err)
}
return string(b)
}
func FromString(s string) *ShardConfig {
scfg := &ShardConfig{}
if err := json.Unmarshal([]byte(s), scfg); err != nil {
log.Fatalf("Unmarshall err %v", err)
}
return scfg
}
func (cfg *ShardConfig) Copy() *ShardConfig {
c := MakeShardConfig()
c.Num = cfg.Num
c.Shards = cfg.Shards
for k, srvs := range cfg.Groups {
s := make([]string, len(srvs))
copy(s, srvs)
c.Groups[k] = s
}
return c
}
// mostgroup, mostn, leastgroup, leastn
func analyze(c *ShardConfig) (tester.Tgid, int, tester.Tgid, int) {
counts := map[tester.Tgid]int{}
for _, g := range c.Shards {
counts[g] += 1
}
mn := -1
var mg tester.Tgid = -1
ln := 257
var lg tester.Tgid = -1
	// Enforce deterministic ordering; map iteration
	// order is randomized in Go.
groups := make([]tester.Tgid, len(c.Groups))
i := 0
for k := range c.Groups {
groups[i] = k
i++
}
slices.Sort(groups)
for _, g := range groups {
if counts[g] < ln {
ln = counts[g]
lg = g
}
if counts[g] > mn {
mn = counts[g]
mg = g
}
}
return mg, mn, lg, ln
}
// return GID of group with least number of
// assigned shards.
func least(c *ShardConfig) tester.Tgid {
_, _, lg, _ := analyze(c)
return lg
}
// balance assignment of shards to groups.
// modifies c.
func (c *ShardConfig) Rebalance() {
// if no groups, un-assign all shards
if len(c.Groups) < 1 {
for s, _ := range c.Shards {
c.Shards[s] = 0
}
return
}
// assign all unassigned shards
for s, g := range c.Shards {
_, ok := c.Groups[g]
if ok == false {
lg := least(c)
c.Shards[s] = lg
}
}
// move shards from most to least heavily loaded
for {
mg, mn, lg, ln := analyze(c)
if mn < ln+2 {
break
}
// move 1 shard from mg to lg
for s, g := range c.Shards {
if g == mg {
c.Shards[s] = lg
break
}
}
}
}
func (cfg *ShardConfig) Join(servers map[tester.Tgid][]string) {
changed := false
for gid, servers := range servers {
_, ok := cfg.Groups[gid]
if ok {
log.Fatalf("re-Join %v", gid)
}
for xgid, xservers := range cfg.Groups {
for _, s1 := range xservers {
for _, s2 := range servers {
if s1 == s2 {
log.Fatalf("Join(%v) puts server %v in groups %v and %v", gid, s1, xgid, gid)
}
}
}
}
// new GID
// modify cfg to reflect the Join()
cfg.Groups[gid] = servers
changed = true
}
if changed == false {
log.Fatalf("Join but no change")
}
cfg.Num += 1
}
func (cfg *ShardConfig) Leave(gids []tester.Tgid) {
changed := false
for _, gid := range gids {
_, ok := cfg.Groups[gid]
if ok == false {
// already no GID!
debug.PrintStack()
log.Fatalf("Leave(%v) but not in config", gid)
} else {
			// modify cfg to reflect the Leave()
delete(cfg.Groups, gid)
changed = true
}
}
if changed == false {
debug.PrintStack()
log.Fatalf("Leave but no change")
}
cfg.Num += 1
}
func (cfg *ShardConfig) JoinBalance(servers map[tester.Tgid][]string) {
cfg.Join(servers)
cfg.Rebalance()
}
func (cfg *ShardConfig) LeaveBalance(gids []tester.Tgid) {
cfg.Leave(gids)
cfg.Rebalance()
}
func (cfg *ShardConfig) GidServers(sh Tshid) (tester.Tgid, []string, bool) {
gid := cfg.Shards[sh]
srvs, ok := cfg.Groups[gid]
return gid, srvs, ok
}
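// A common lookup pattern built from the helpers above (a sketch of what
// a shardkv clerk ends up doing): map a key to its shard, then to the
// group and servers currently responsible for it.
func (cfg *ShardConfig) exampleLookup(key string) (tester.Tgid, []string, bool) {
	sh := Key2Shard(key)
	return cfg.GidServers(sh)
}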
func (cfg *ShardConfig) IsMember(gid tester.Tgid) bool {
for _, g := range cfg.Shards {
if g == gid {
return true
}
}
return false
}
func (cfg *ShardConfig) CheckConfig(t *testing.T, groups []tester.Tgid) {
if len(cfg.Groups) != len(groups) {
fatalf(t, "wanted %v groups, got %v", len(groups), len(cfg.Groups))
}
// are the groups as expected?
for _, g := range groups {
_, ok := cfg.Groups[g]
if ok != true {
fatalf(t, "missing group %v", g)
}
}
// any un-allocated shards?
if len(groups) > 0 {
for s, g := range cfg.Shards {
_, ok := cfg.Groups[g]
if ok == false {
fatalf(t, "shard %v -> invalid group %v", s, g)
}
}
}
// more or less balanced sharding?
counts := map[tester.Tgid]int{}
for _, g := range cfg.Shards {
counts[g] += 1
}
min := 257
max := 0
for g, _ := range cfg.Groups {
if counts[g] > max {
max = counts[g]
}
if counts[g] < min {
min = counts[g]
}
}
if max > min+1 {
fatalf(t, "max %v too much larger than min %v", max, min)
}
}
func fatalf(t *testing.T, format string, args ...any) {
debug.PrintStack()
t.Fatalf(format, args...)
}

View File

@ -0,0 +1,62 @@
package shardcfg
import (
	"testing"
	"6.5840/tester1"
)
func check_same_config(t *testing.T, c1 ShardConfig, c2 ShardConfig) {
if c1.Num != c2.Num {
t.Fatalf("Num wrong")
}
if c1.Shards != c2.Shards {
t.Fatalf("Shards wrong")
}
if len(c1.Groups) != len(c2.Groups) {
t.Fatalf("number of Groups is wrong")
}
for gid, sa := range c1.Groups {
sa1, ok := c2.Groups[gid]
if ok == false || len(sa1) != len(sa) {
t.Fatalf("len(Groups) wrong")
}
if ok && len(sa1) == len(sa) {
for j := 0; j < len(sa); j++ {
if sa[j] != sa1[j] {
t.Fatalf("Groups wrong")
}
}
}
}
}
func TestBasic(t *testing.T) {
const (
Gid1 = 1
Gid2 = 2
)
cfg := MakeShardConfig()
	cfg.CheckConfig(t, []tester.Tgid{})
	cfg.JoinBalance(map[tester.Tgid][]string{Gid1: []string{"x", "y", "z"}})
	cfg.CheckConfig(t, []tester.Tgid{Gid1})
	cfg.JoinBalance(map[tester.Tgid][]string{Gid2: []string{"a", "b", "c"}})
	cfg.CheckConfig(t, []tester.Tgid{Gid1, Gid2})
sa1 := cfg.Groups[Gid1]
if len(sa1) != 3 || sa1[0] != "x" || sa1[1] != "y" || sa1[2] != "z" {
t.Fatalf("wrong servers for gid %v: %v\n", Gid1, sa1)
}
sa2 := cfg.Groups[Gid2]
if len(sa2) != 3 || sa2[0] != "a" || sa2[1] != "b" || sa2[2] != "c" {
t.Fatalf("wrong servers for gid %v: %v\n", Gid2, sa2)
}
	cfg.LeaveBalance([]tester.Tgid{Gid1})
	cfg.CheckConfig(t, []tester.Tgid{Gid2})
	cfg.LeaveBalance([]tester.Tgid{Gid2})
	cfg.CheckConfig(t, []tester.Tgid{})
}

View File

@ -0,0 +1,49 @@
package shardctrler
import (
// "log"
"sync/atomic"
"6.5840/kvsrv1/rpc"
"6.5840/tester1"
)
type Clerk struct {
clnt *tester.Clnt
servers []string
deposed *int32
// You will have to modify this struct.
}
// The shard controller can use MakeClerk to make a clerk for the kvraft
// group with the servers `servers`.
func MakeClerk(clnt *tester.Clnt, servers []string, deposed *int32) *Clerk {
ck := &Clerk{clnt: clnt, servers: servers, deposed: deposed}
// You may add code here.
return ck
}
func (ck *Clerk) isDeposed() bool {
z := atomic.LoadInt32(ck.deposed)
return z == 1
}
// You can reuse your kvraft Get
func (ck *Clerk) Get(key string) (string, rpc.Tversion, rpc.Err) {
args := rpc.GetArgs{}
args.Key = key
// You'll have to add code here.
return "", 0, ""
}
// You can reuse your kvraft Put
func (ck *Clerk) Put(key string, value string, version rpc.Tversion) rpc.Err {
args := rpc.PutArgs{}
args.Key = key
args.Value = value
args.Version = version
// You'll have to add code here.
return ""
}

View File

@ -0,0 +1,58 @@
package lock
import (
"log"
"time"
"6.5840/kvsrv1/rpc"
"6.5840/kvtest1"
)
type Lock struct {
kvtest.IKVClerk
l string
id string
ver rpc.Tversion
}
func MakeLock(ck kvtest.IKVClerk, l string) *Lock {
lk := &Lock{IKVClerk: ck}
	// You may add code here
return lk
}
func (lk *Lock) AcquireLeadership() {
for {
if val, ver, err := lk.Get(lk.l); err == rpc.OK {
if val == "" { // put only when lock is free
if err := lk.Put(lk.l, lk.id, ver); err == rpc.OK {
lk.ver = ver + 1
return
} else if err == rpc.ErrMaybe { // check if put succeeded?
if val, ver, err := lk.Get(lk.l); err == rpc.OK {
if val == lk.id {
lk.ver = ver
return
}
}
}
}
time.Sleep(1 * time.Millisecond)
}
}
}
// used for two testing purposes: 1) to make the ctrler that is the leader
// give up its leadership; 2) to take back leadership from a
// partitioned/deposed ctrler using a new ctrler.
func (lk *Lock) ReleaseLeadership() rpc.Err {
_, ver, err := lk.Get(lk.l)
if err != rpc.OK {
log.Printf("ResetLock: %v err %v", lk.l, err)
}
if err := lk.Put(lk.l, "", ver); err == rpc.OK || err == rpc.ErrMaybe {
return rpc.OK
} else {
return err
}
}
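// A hedged usage sketch (the lock key name "leader-lock" below is made up
// for illustration; the controller chooses the real key):
func exampleLeadership(ck kvtest.IKVClerk) {
	lk := MakeLock(ck, "leader-lock")
	lk.AcquireLeadership()
	// ... act as the controller while holding leadership ...
	if err := lk.ReleaseLeadership(); err != rpc.OK {
		log.Printf("ReleaseLeadership: %v", err)
	}
}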

View File

@ -0,0 +1,115 @@
package shardctrler
//
// Shardctrler implemented as a clerk.
//
import (
"sync/atomic"
"6.5840/kvraft1"
"6.5840/kvsrv1/rpc"
"6.5840/kvtest1"
"6.5840/shardkv1/shardcfg"
"6.5840/tester1"
)
const (
ErrDeposed = "ErrDeposed"
)
// The query clerk must support only Query(); it is intended for use
// by shardkv clerks to read the current configuration (see
// ../client.go).
type QueryClerk struct {
kvtest.IKVClerk
// Your data here.
}
// Make a query clerk for controller's kvraft group to invoke just
// Query()
func MakeQueryClerk(clnt *tester.Clnt, servers []string) *QueryClerk {
qck := &QueryClerk{
IKVClerk: kvraft.MakeClerk(clnt, servers),
}
// Your code here.
return qck
}
// Return the current configuration. You can use Get() to retrieve
// the string representing the configuration and shardcfg.FromString()
// to unmarshal the string into a ShardConfig.
func (qck *QueryClerk) Query() (*shardcfg.ShardConfig, rpc.Tversion) {
// Your code here.
return nil, 0
}
// ShardCtrlerClerk for the shard controller. It implements the
// methods for Init(), Join(), Leave(), etc.
type ShardCtrlerClerk struct {
clnt *tester.Clnt
deposed int32 // set by Stepdown()
// Your data here.
}
// Make a ShardCtrlerClerk for the shard controller, which stores its
// state in a kvraft group. You can call (and implement) the
// MakeClerk method in client.go to make a kvraft clerk for the kvraft
// group with the servers `servers`.
func MakeShardCtrlerClerk(clnt *tester.Clnt, servers []string) *ShardCtrlerClerk {
sck := &ShardCtrlerClerk{clnt: clnt}
// Your code here.
return sck
}
// Called once by the tester to supply the first configuration. You
// can marshal ShardConfig into a string using shardcfg.String(), and
// then Put it in the kvraft group for the controller at version 0.
// You can pick the key to name the configuration.
func (sck *ShardCtrlerClerk) Init(cfg *shardcfg.ShardConfig) rpc.Err {
// Your code here
return rpc.OK
}
// Add group gid. Use shardcfg.JoinBalance() to compute the new
// configuration; the supplied `srvrs` are the servers for the new
// group. You can find the servers for existing groups in the
// configuration (which you can retrieve using Query()) and you can
// make a clerk for a group by calling shardgrp.MakeClerk(sck.clnt,
// servers), and then invoke its Freeze/InstallShard methods.
func (sck *ShardCtrlerClerk) Join(gid tester.Tgid, srvs []string) rpc.Err {
// Your code here
return rpc.ErrNoKey
}
// Group gid leaves. You can use shardcfg.LeaveBalance() to compute
// the new configuration.
func (sck *ShardCtrlerClerk) Leave(gid tester.Tgid) rpc.Err {
// Your code here
return rpc.ErrNoKey
}
// the tester calls Stepdown() to force a ctrler to step down while it
// is perhaps in the middle of a join/move. for your convenience, we
// also supply an isDeposed() method to test sck.deposed in long-running
// loops.
func (sck *ShardCtrlerClerk) Stepdown() {
atomic.StoreInt32(&sck.deposed, 1)
}
func (sck *ShardCtrlerClerk) isDeposed() bool {
z := atomic.LoadInt32(&sck.deposed)
return z == 1
}
// Return the current configuration
func (sck *ShardCtrlerClerk) Query() (*shardcfg.ShardConfig, rpc.Tversion, rpc.Err) {
// Your code here.
return nil, 0, ""
}
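// A hedged sketch of the call sequence the tester expects once the
// methods above are implemented (the gid and server names are supplied
// by the caller and are placeholders here):
func exampleCtrlerFlow(sck *ShardCtrlerClerk, newGid tester.Tgid, srvs []string) {
	// read the configuration installed earlier by Init()
	if cfg, _, err := sck.Query(); err == rpc.OK && cfg != nil {
		_ = cfg.Num
	}
	// add a new replica group, then remove it again
	if err := sck.Join(newGid, srvs); err == rpc.OK {
		_ = sck.Leave(newGid)
	}
}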

View File

@ -0,0 +1,38 @@
package shardgrp
import (
"6.5840/kvsrv1/rpc"
"6.5840/shardkv1/shardcfg"
"6.5840/tester1"
)
type Clerk struct {
clnt *tester.Clnt
servers []string
leader int // last successful leader (index into servers[])
}
func MakeClerk(clnt *tester.Clnt, servers []string) *Clerk {
ck := &Clerk{clnt: clnt, servers: servers}
return ck
}
func (ck *Clerk) Get(cid shardcfg.Tnum, key string, n shardcfg.Tnum) (string, rpc.Tversion, rpc.Err) {
// Your code here
return "", 0, ""
}
func (ck *Clerk) Put(key string, value string, version rpc.Tversion, n shardcfg.Tnum) (bool, rpc.Err) {
// Your code here
return false, ""
}
func (ck *Clerk) Freeze(s shardcfg.Tshid, num shardcfg.Tnum) ([]byte, rpc.Err) {
return nil, ""
}
func (ck *Clerk) InstallShard(s shardcfg.Tshid, state []byte, num shardcfg.Tnum) rpc.Err {
return ""
}

View File

@ -0,0 +1,99 @@
package shardgrp
import (
	"sync/atomic"
	"6.5840/kvraft1/rsm"
	"6.5840/kvsrv1/rpc"
	"6.5840/labgob"
	"6.5840/labrpc"
	"6.5840/raft"
	"6.5840/shardkv1/shardgrp/shardrpc"
	"6.5840/tester1"
)
type KVServer struct {
gid tester.Tgid
me int
dead int32 // set by Kill()
rsm *rsm.RSM
}
func (kv *KVServer) DoOp(req any) any {
// Your code here
return nil
}
func (kv *KVServer) Snapshot() []byte {
// Your code here
return nil
}
func (kv *KVServer) Restore(data []byte) {
// Your code here
}
func (kv *KVServer) Get(args *shardrpc.GetArgs, reply *rpc.GetReply) {
// Your code here
}
func (kv *KVServer) Put(args *shardrpc.PutArgs, reply *rpc.PutReply) {
// Your code here
}
// Freeze the specified shard (i.e., reject future Get/Puts for this
// shard) and return the key/values stored in that shard.
func (kv *KVServer) Freeze(args *shardrpc.FreezeArgs, reply *shardrpc.FreezeReply) {
// Your code here
}
// Install the supplied state for the specified shard.
func (kv *KVServer) InstallShard(args *shardrpc.InstallShardArgs, reply *shardrpc.InstallShardReply) {
// Your code here
}
// the tester calls Kill() when a KVServer instance won't
// be needed again. for your convenience, we supply
// code to set kv.dead (without needing a lock),
// and a killed() method to test kv.dead in
// long-running loops. you can also add your own
// code to Kill(). you're not required to do anything
// about this, but it may be convenient (for example)
// to suppress debug output from a Kill()ed instance.
func (kv *KVServer) Kill() {
atomic.StoreInt32(&kv.dead, 1)
// Your code here, if desired.
}
// Return kv's raft struct
func (kv *KVServer) Raft() *raft.Raft {
return kv.rsm.Raft()
}
func (kv *KVServer) killed() bool {
z := atomic.LoadInt32(&kv.dead)
return z == 1
}
// StartKVServer() and MakeRSM() must return quickly, so they should
// start goroutines for any long-running work.
func StartKVServer(servers []*labrpc.ClientEnd, gid tester.Tgid, me int, persister *raft.Persister, maxraftstate int) tester.IKVServer {
// call labgob.Register on structures you want
// Go's RPC library to marshall/unmarshall.
labgob.Register(shardrpc.PutArgs{})
labgob.Register(shardrpc.GetArgs{})
labgob.Register(shardrpc.FreezeArgs{})
labgob.Register(shardrpc.InstallShardArgs{})
labgob.Register(shardrpc.DeleteShardArgs{})
labgob.Register(rsm.Op{})
kv := &KVServer{gid: gid, me: me}
kv.rsm = rsm.MakeRSM(servers, me, persister, maxraftstate, kv)
// Your code here
return kv
}

View File

@ -0,0 +1,50 @@
package shardrpc
import (
"6.5840/kvsrv1/rpc"
"6.5840/shardkv1/shardcfg"
)
// Same as Put in kvsrv1/rpc, but with a configuration number.
type PutArgs struct {
Key string
Value string
Version rpc.Tversion
Num shardcfg.Tnum
}
// Same as Get in kvsrv1/rpc, but with a configuration number.
type GetArgs struct {
Key string
Num shardcfg.Tnum
}
type FreezeArgs struct {
Shard shardcfg.Tshid
Num shardcfg.Tnum
}
type FreezeReply struct {
State []byte
Num shardcfg.Tnum
Err rpc.Err
}
type InstallShardArgs struct {
Shard shardcfg.Tshid
State []byte
Num shardcfg.Tnum
}
type InstallShardReply struct {
Err rpc.Err
}
type DeleteShardArgs struct {
Shard shardcfg.Tshid
Num shardcfg.Tnum
}
type DeleteShardReply struct {
Err rpc.Err
}

View File

@ -0,0 +1,304 @@
package shardkv
import (
"log"
"testing"
"time"
"6.5840/kvsrv1/rpc"
"6.5840/kvtest1"
"6.5840/shardkv1/shardcfg"
"6.5840/tester1"
// "6.5840/shardkv1/shardctrler"
)
const (
NGRP = 8
)
// Set up a k/v service with 1 shardgrp (group 0) for the controller to
// store its state and 1 shardgrp (group 1) to store all shards. Tests
// the controller's Init() and Query(), and shardkv's Get/Put without
// reconfiguration.
func TestStaticOneShardGroup5A(t *testing.T) {
ts := MakeTest(t, "Test (5A): one shard group ...", true, false)
defer ts.Cleanup()
// The tester's setupKVService() sets up a kvraft group for the
// controller to store configurations and calls the controller's
// Init() method to create the first configuration.
ts.setupKVService()
sck := ts.ShardCtrler() // get the controller clerk from tester
// Read the initial configuration and check it
cfg, v, err := sck.Query()
if err != rpc.OK {
ts.Fatalf("Query failed %v", err)
}
if v != 1 || cfg.Num != 1 || cfg.Shards[0] != shardcfg.Gid1 {
ts.Fatalf("Static wrong %v %v", cfg, v)
}
cfg.CheckConfig(t, []tester.Tgid{shardcfg.Gid1})
ck := ts.MakeClerk() // make a shardkv clerk
ka, va := ts.SpreadPuts(ck, shardcfg.NShards) // do some puts
n := len(ka)
for i := 0; i < n; i++ {
ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1)) // check the puts
}
}
// test shardctrler's join, which adds a new group Gid2 and must move
// shards to the new group; the old group should reject Get/Puts on
// shards that moved.
func TestJoinBasic5A(t *testing.T) {
ts := MakeTest(t, "Test (5A): a group joins...", true, false)
defer ts.Cleanup()
gid1 := ts.setupKVService()
ck := ts.MakeClerk()
ka, va := ts.SpreadPuts(ck, shardcfg.NShards)
sck := ts.ShardCtrler()
cfg, _, err := sck.Query()
if err != rpc.OK {
ts.t.Fatalf("Query: err %v", err)
}
gid2 := ts.newGid()
err = ts.joinGroups(sck, []tester.Tgid{gid2})
if err != rpc.OK {
ts.t.Fatalf("joinGroups: err %v", err)
}
cfg1, _, err := sck.Query()
if err != rpc.OK {
ts.t.Fatalf("Query 1: err %v", err)
}
if cfg.Num+1 != cfg1.Num {
ts.t.Fatalf("wrong num %d expected %d ", cfg1.Num, cfg.Num+1)
}
if !cfg1.IsMember(gid2) {
ts.t.Fatalf("%d isn't a member of %v", gid2, cfg1)
}
// check shards at shardcfg.Gid2
ts.checkShutdownSharding(gid1, gid2, ka, va)
for i := 0; i < len(ka); i++ {
ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
}
// check shards at shardcfg.Gid1
ts.checkShutdownSharding(gid2, gid1, ka, va)
for i := 0; i < len(ka); i++ {
ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
}
}
// test shardctrler's leave
func TestJoinLeaveBasic5A(t *testing.T) {
ts := MakeTest(t, "Test (5A): basic groups join/leave ...", true, false)
defer ts.Cleanup()
gid1 := ts.setupKVService()
ck := ts.MakeClerk()
ka, va := ts.SpreadPuts(ck, shardcfg.NShards)
sck := ts.ShardCtrler()
gid2 := ts.newGid()
err := ts.joinGroups(sck, []tester.Tgid{gid2})
if err != rpc.OK {
ts.t.Fatalf("joinGroups: err %v", err)
}
// check shards at shardcfg.Gid2
ts.checkShutdownSharding(gid1, gid2, ka, va)
for i := 0; i < len(ka); i++ {
ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
}
err = sck.Leave(shardcfg.Gid1)
if err != rpc.OK {
ts.t.Fatalf("Leave: err %v", err)
}
cfg, _, err := sck.Query()
if err != rpc.OK {
ts.t.Fatalf("Query err %v", err)
}
if cfg.IsMember(shardcfg.Gid1) {
ts.t.Fatalf("%d is a member of %v", shardcfg.Gid1, cfg)
}
ts.Group(shardcfg.Gid1).Shutdown()
for i := 0; i < len(ka); i++ {
ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
}
// bring the crashed shard/group back to life.
ts.Group(shardcfg.Gid1).StartServers()
// Rejoin
sck.Join(shardcfg.Gid1, ts.Group(shardcfg.Gid1).SrvNames())
for i := 0; i < len(ka); i++ {
ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
}
// check shards at shardcfg.Gid2
ts.checkShutdownSharding(gid2, gid1, ka, va)
}
// test many groups joining and leaving, reliable or unreliable
func joinLeave5A(t *testing.T, reliable bool, part string) {
ts := MakeTest(t, "Test (5A): many groups join/leave ...", reliable, false)
defer ts.Cleanup()
ts.setupKVService()
ck := ts.MakeClerk()
ka, va := ts.SpreadPuts(ck, shardcfg.NShards)
sck := ts.ShardCtrler()
grps := ts.groups(NGRP)
ts.joinGroups(sck, grps)
ts.checkShutdownSharding(grps[0], grps[1], ka, va)
for i := 0; i < len(ka); i++ {
ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
}
ts.leaveGroups(sck, grps)
for i := 0; i < len(ka); i++ {
ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
}
}
func TestManyJoinLeaveReliable5A(t *testing.T) {
joinLeave5A(t, true, "Test (5A): many groups join/leave reliable...")
}
func TestManyJoinLeaveUnreliable5A(t *testing.T) {
joinLeave5A(t, false, "Test (5A): many groups join/leave unreliable...")
}
// Test we can recover from complete shutdown using snapshots
func TestSnapshot5A(t *testing.T) {
const NGRP = 3
ts := MakeTest(t, "Test (5A): snapshots ...", true, false)
defer ts.Cleanup()
ts.setupKVService()
ck := ts.MakeClerk()
ka, va := ts.SpreadPuts(ck, shardcfg.NShards)
sck := ts.ShardCtrler()
grps := ts.groups(2)
ts.joinGroups(sck, grps)
// check shards at shardcfg.Gid2
ts.checkShutdownSharding(grps[0], grps[1], ka, va)
for i := 0; i < len(ka); i++ {
ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
}
for i := tester.Tgid(0); i < NGRP; i++ {
ts.Group(shardcfg.Gid1).Shutdown()
}
for i := tester.Tgid(0); i < NGRP; i++ {
ts.Group(shardcfg.Gid1).StartServers()
}
for i := 0; i < len(ka); i++ {
ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
}
}
// Test linearizability with groups joining/leaving and `nclnt`
// concurrent clerks put/get's in `unreliable` net.
func concurrentClerk(t *testing.T, nclnt int, reliable bool, part string) {
const (
NSEC = 20
)
ts := MakeTest(t, part, reliable, true)
defer ts.Cleanup()
ts.setupKVService()
ka := kvtest.MakeKeys(shardcfg.NShards)
ch := make(chan []kvtest.ClntRes)
start := time.Now()
go func(ch chan []kvtest.ClntRes) {
rs := ts.SpawnClientsAndWait(nclnt, NSEC*time.Second, func(me int, ck kvtest.IKVClerk, done chan struct{}) kvtest.ClntRes {
return ts.OneClientPut(me, ck, ka, done)
})
ch <- rs
}(ch)
sck := ts.ShardCtrler()
grps := ts.groups(NGRP)
ts.joinGroups(sck, grps)
ts.leaveGroups(sck, grps)
log.Printf("time joining/leaving %v", time.Since(start))
rsa := <-ch
log.Printf("rsa %v", rsa)
ts.CheckPorcupine()
}
// Test linearizability with groups joining/leaving and 1 concurrent clerks put/get's
func TestOneConcurrentClerkReliable5A(t *testing.T) {
concurrentClerk(t, 1, true, "Test (5A): one concurrent clerk reliable...")
}
// Test linearizability with groups joining/leaving and many concurrent clerks put/get's
func TestManyConcurrentClerkReliable5A(t *testing.T) {
const NCLNT = 10
concurrentClerk(t, NCLNT, true, "Test (5A): many concurrent clerks reliable...")
}
// Test linearizability with groups joining/leaving and 1 concurrent clerks put/get's
func TestOneConcurrentClerkUnreliable5A(t *testing.T) {
concurrentClerk(t, 1, false, "Test (5A): one concurrent clerk unreliable ...")
}
// Test linearizability with groups joining/leaving and many concurrent clerks put/get's
func TestManyConcurrentClerkUnreliable5A(t *testing.T) {
const NCLNT = 10
concurrentClerk(t, NCLNT, false, "Test (5A): many concurrent clerks unreliable...")
}
// test recovery of partitioned controllers
func TestRecoverCtrler5B(t *testing.T) {
	const (
		NPARTITION = 10
	)
	ts := MakeTest(t, "Test (5B): recover controller ...", true, false)
defer ts.Cleanup()
ts.setupKVService()
ck := ts.MakeClerk()
ka, va := ts.SpreadPuts(ck, shardcfg.NShards)
	for i := 0; i < NPARTITION; i++ {
ts.partitionCtrler(ck, ka, va)
}
}

303
src/shardkv1/test.go Normal file
View File

@ -0,0 +1,303 @@
package shardkv
import (
"fmt"
"log"
"math/rand"
"sync"
"testing"
"time"
"6.5840/kvraft1"
"6.5840/kvsrv1/rpc"
"6.5840/kvtest1"
"6.5840/labrpc"
"6.5840/shardkv1/shardcfg"
"6.5840/shardkv1/shardctrler"
"6.5840/shardkv1/shardgrp"
"6.5840/tester1"
)
type Test struct {
t *testing.T
*kvtest.Test
sck *shardctrler.ShardCtrlerClerk
part string
mu sync.Mutex
ngid tester.Tgid
}
const (
	Controler = tester.Tgid(0) // the controller uses group 0 as its kvraft group
NSRV = 3 // servers per group
INTERGRPDELAY = 200 // time in ms between group changes
)
// Set up a kvraft group (group 0) for the shard controller and make
// the controller clerk.
func MakeTest(t *testing.T, part string, reliable, randomkeys bool) *Test {
cfg := tester.MakeConfig(t, NSRV, reliable, -1, kvraft.StartKVServer)
ts := &Test{
ngid: shardcfg.Gid1 + 1, // Gid1 is in use
t: t,
}
ts.Test = kvtest.MakeTest(t, cfg, randomkeys, ts)
ts.sck = ts.makeShardCtrlerClerk()
ts.Begin(part)
return ts
}
func (ts *Test) MakeClerk() kvtest.IKVClerk {
clnt := ts.Config.MakeClient()
ck := MakeClerk(clnt, ts.makeQueryClerk())
return &kvtest.TestClerk{ck, clnt}
}
func (ts *Test) DeleteClerk(ck kvtest.IKVClerk) {
tck := ck.(*kvtest.TestClerk)
ts.DeleteClient(tck.Clnt)
}
func (ts *Test) ShardCtrler() *shardctrler.ShardCtrlerClerk {
return ts.sck
}
func (ts *Test) makeShardCtrlerClerk() *shardctrler.ShardCtrlerClerk {
ck, _ := ts.makeShardCtrlerClerkClnt()
return ck
}
func (ts *Test) makeShardCtrlerClerkClnt() (*shardctrler.ShardCtrlerClerk, *tester.Clnt) {
srvs := ts.Group(Controler).SrvNames()
clnt := ts.Config.MakeClient()
return shardctrler.MakeShardCtrlerClerk(clnt, srvs), clnt
}
func (ts *Test) makeQueryClerk() *shardctrler.QueryClerk {
srvs := ts.Group(Controler).SrvNames()
clnt := ts.Config.MakeClient()
return shardctrler.MakeQueryClerk(clnt, srvs)
}
func (ts *Test) newGid() tester.Tgid {
ts.mu.Lock()
defer ts.mu.Unlock()
gid := ts.ngid
ts.ngid += 1
return gid
}
func (ts *Test) groups(n int) []tester.Tgid {
grps := make([]tester.Tgid, n)
for i := 0; i < n; i++ {
grps[i] = ts.newGid()
}
return grps
}
// Set up the KV service with one group, Gid1. Gid1 should initialize
// itself to own all shards.
func (ts *Test) setupKVService() tester.Tgid {
scfg := shardcfg.MakeShardConfig()
ts.Config.MakeGroupStart(shardcfg.Gid1, NSRV, -1, shardgrp.StartKVServer)
scfg.JoinBalance(map[tester.Tgid][]string{shardcfg.Gid1: ts.Group(shardcfg.Gid1).SrvNames()})
if err := ts.sck.Init(scfg); err != rpc.OK {
ts.t.Fatalf("Init err %v", err)
}
//ts.sck.AcquireLeadership()
return shardcfg.Gid1
}
func (ts *Test) joinGroups(sck *shardctrler.ShardCtrlerClerk, gids []tester.Tgid) rpc.Err {
for i, gid := range gids {
ts.Config.MakeGroupStart(gid, NSRV, -1, shardgrp.StartKVServer)
if err := sck.Join(gid, ts.Group(gid).SrvNames()); err != rpc.OK {
return err
}
if i < len(gids)-1 {
time.Sleep(INTERGRPDELAY * time.Millisecond)
}
}
return rpc.OK
}
func (ts *Test) leaveGroups(sck *shardctrler.ShardCtrlerClerk, gids []tester.Tgid) rpc.Err {
for i, gid := range gids {
if err := sck.Leave(gid); err != rpc.OK {
return err
}
ts.Config.ExitGroup(gid)
if i < len(gids)-1 {
time.Sleep(INTERGRPDELAY * time.Millisecond)
}
}
return rpc.OK
}
func (ts *Test) checkLogs(gids []tester.Tgid) {
for _, gid := range gids {
n := ts.Group(gid).LogSize()
s := ts.Group(gid).SnapshotSize()
if ts.Group(gid).Maxraftstate >= 0 && n > 8*ts.Group(gid).Maxraftstate {
ts.t.Fatalf("persister.RaftStateSize() %v, but maxraftstate %v",
n, ts.Group(gid).Maxraftstate)
}
if ts.Group(gid).Maxraftstate < 0 && s > 0 {
ts.t.Fatalf("maxraftstate is -1, but snapshot is non-empty!")
}
}
}
// make sure that the data really is sharded by
// shutting down one shard and checking that some
// Get()s don't succeed.
func (ts *Test) checkShutdownSharding(down, up tester.Tgid, ka []string, va []string) {
const NSEC = 2
ts.Group(down).Shutdown()
ts.checkLogs([]tester.Tgid{down, up}) // forbid snapshots
n := len(ka)
ch := make(chan string)
for xi := 0; xi < n; xi++ {
ck1 := ts.MakeClerk()
go func(i int) {
v, _, _ := ck1.Get(ka[i])
if v != va[i] {
ch <- fmt.Sprintf("Get(%v): expected:\n%v\nreceived:\n%v", ka[i], va[i], v)
} else {
ch <- ""
}
}(xi)
}
// wait a bit, only about half the Gets should succeed.
ndone := 0
done := false
for done == false {
select {
case err := <-ch:
if err != "" {
ts.Fatalf(err)
}
ndone += 1
case <-time.After(time.Second * NSEC):
done = true
break
}
}
// log.Printf("%d completions out of %d with %d groups", ndone, n, ngrp)
if ndone >= n {
ts.Fatalf("expected less than %d completions with one shard dead\n", n)
}
// bring the crashed shard/group back to life.
ts.Group(down).StartServers()
}
// Run one controller and then partition it forever after some time.
// Run another controller that must finish the first controller's
// unfinished shard moves, if there are any.
func (ts *Test) partitionCtrler(ck kvtest.IKVClerk, ka, va []string) {
const (
MSEC = 20
RAND = 2000 // maybe measure?
)
ch := make(chan tester.Tgid)
sck, clnt := ts.makeShardCtrlerClerkClnt()
cfg, _, err := ts.ShardCtrler().Query()
num := cfg.Num
go func() {
for true {
ngid := ts.newGid()
//log.Printf("join %d", ngid)
//s := time.Now()
ch <- ngid
err := ts.joinGroups(sck, []tester.Tgid{ngid})
if err == rpc.OK {
err = ts.leaveGroups(sck, []tester.Tgid{ngid})
}
//log.Printf("join err %v time %v", err, time.Since(s))
if err == shardctrler.ErrDeposed {
log.Printf("disposed")
return
}
if err != rpc.OK {
ts.t.Fatalf("join/leave err %v", err)
}
time.Sleep(INTERGRPDELAY * time.Millisecond)
}
}()
lastgid := <-ch
d := time.Duration(rand.Int()%RAND) * time.Millisecond
time.Sleep(MSEC*time.Millisecond + d)
log.Printf("disconnect sck %v", d)
// partition sck forever
clnt.DisconnectAll()
// force sck to step down
sck.Stepdown()
// wait until sck has no more requests in the network
time.Sleep(labrpc.MAXDELAY)
cfg, _, err = ts.ShardCtrler().Query()
if err != rpc.OK {
ts.Fatalf("Query err %v", err)
}
recovery := false
present := cfg.IsMember(lastgid)
join := num == cfg.Num
leave := num+1 == cfg.Num
if !present && join {
recovery = true
}
if present && leave {
recovery = true
}
	// start a new controller to pick up where sck left off
sck0, clnt0 := ts.makeShardCtrlerClerkClnt()
if err != rpc.OK {
ts.Fatalf("Query err %v", err)
}
cfg, _, err = sck0.Query()
if recovery {
s := "join"
if leave {
s = "leave"
}
//log.Printf("%v in progress", s)
present = cfg.IsMember(lastgid)
if (join && !present) || (leave && present) {
ts.Fatalf("didn't recover %d correctly after %v", lastgid, s)
}
}
if present {
// cleanup if disconnected after join but before leave
ts.leaveGroups(sck0, []tester.Tgid{lastgid})
}
for i := 0; i < len(ka); i++ {
ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
}
ts.Config.DeleteClient(clnt)
ts.Config.DeleteClient(clnt0)
}

153
src/tester1/clnts.go Normal file
View File

@ -0,0 +1,153 @@
package tester
import (
//"log"
"os"
"sync"
"6.5840/labrpc"
)
type end struct {
name string
end *labrpc.ClientEnd
}
// Servers are named by ServerName() and clerks lazily make a
// per-clerk ClientEnd to a server. Each clerk has a Clnt with a map
// of the allocated ends for this clerk.
type Clnt struct {
mu sync.Mutex
net *labrpc.Network
ends map[string]end
	// if srvs is nil, the client can connect to all servers;
	// if len(srvs) == 0, the client cannot connect to any server
srvs []string
}
func makeClntTo(net *labrpc.Network, srvs []string) *Clnt {
return &Clnt{ends: make(map[string]end), net: net, srvs: srvs}
}
// caller must acquire lock
func (clnt *Clnt) allowedL(server string) bool {
if clnt.srvs == nil {
return true
}
for _, n := range clnt.srvs {
if n == server {
return true
}
}
return false
}
func (clnt *Clnt) makeEnd(server string) end {
clnt.mu.Lock()
defer clnt.mu.Unlock()
if end, ok := clnt.ends[server]; ok {
return end
}
name := Randstring(20)
//log.Printf("%p: makEnd %v %v allowed %t", clnt, name, server, clnt.allowedL(server))
end := end{name: name, end: clnt.net.MakeEnd(name)}
clnt.net.Connect(name, server)
if clnt.allowedL(server) {
clnt.net.Enable(name, true)
} else {
clnt.net.Enable(name, false)
}
clnt.ends[server] = end
return end
}
func (clnt *Clnt) Call(server, method string, args interface{}, reply interface{}) bool {
end := clnt.makeEnd(server)
ok := end.end.Call(method, args, reply)
//log.Printf("%p: Call e %v m %v %v %v ok %v", clnt, end.name, method, args, reply, ok)
return ok
}
func (clnt *Clnt) ConnectAll() {
clnt.mu.Lock()
defer clnt.mu.Unlock()
for _, e := range clnt.ends {
// log.Printf("%p: ConnectAll: enable %v", clnt, e.name)
clnt.net.Enable(e.name, true)
}
clnt.srvs = nil
}
func (clnt *Clnt) DisconnectAll() {
clnt.mu.Lock()
defer clnt.mu.Unlock()
for _, e := range clnt.ends {
//log.Printf("%p: Disconnectall: disable %v", clnt, e.name)
clnt.net.Enable(e.name, false)
}
clnt.srvs = make([]string, 0)
}
func (clnt *Clnt) remove() {
clnt.mu.Lock()
defer clnt.mu.Unlock()
for _, e := range clnt.ends {
os.Remove(e.name)
}
}
type Clnts struct {
mu sync.Mutex
net *labrpc.Network
clerks map[*Clnt]struct{}
}
func makeClnts(net *labrpc.Network) *Clnts {
clnts := &Clnts{net: net, clerks: make(map[*Clnt]struct{})}
return clnts
}
func (clnts *Clnts) makeEnd(servername string) *labrpc.ClientEnd {
name := Randstring(20)
end := clnts.net.MakeEnd(name)
clnts.net.Connect(name, servername)
clnts.net.Enable(name, true)
return end
}
// Create a Clnt for a clerk. MakeClient allows connections to all
// servers; MakeClientTo allows connections only to the servers named
// in srvs.
func (clnts *Clnts) MakeClient() *Clnt {
return clnts.MakeClientTo(nil)
}
func (clnts *Clnts) MakeClientTo(srvs []string) *Clnt {
clnts.mu.Lock()
defer clnts.mu.Unlock()
clnt := makeClntTo(clnts.net, srvs)
clnts.clerks[clnt] = struct{}{}
return clnt
}
func (clnts *Clnts) cleanup() {
clnts.mu.Lock()
defer clnts.mu.Unlock()
for clnt := range clnts.clerks {
clnt.remove()
}
}
func (clnts *Clnts) DeleteClient(clnt *Clnt) {
clnts.mu.Lock()
defer clnts.mu.Unlock()
clnt.remove()
delete(clnts.clerks, clnt)
}
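
// The sketch below is illustrative only (not called by any test): it shows
// the intended clerk-side flow -- make a Clnt that may reach only the named
// servers, issue RPCs by server name, cut it off from the network, and
// finally delete it. The method name "KVServer.Get" and the empty args/reply
// are placeholders.
func exampleClntUsage(clnts *Clnts, args, reply interface{}) {
	// restrict this clerk to group 0's first server
	clnt := clnts.MakeClientTo([]string{ServerName(GRP0, 0)})
	ok := clnt.Call(ServerName(GRP0, 0), "KVServer.Get", args, reply)
	_ = ok

	clnt.DisconnectAll() // now no server is reachable
	clnt.ConnectAll()    // reachable again (srvs reset to nil)

	clnts.DeleteClient(clnt) // remove the clerk's ends
}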

180
src/tester1/config.go Normal file
View File

@ -0,0 +1,180 @@
package tester
import (
crand "crypto/rand"
"encoding/base64"
"fmt"
// "log"
"math/big"
"math/rand"
"runtime"
"runtime/debug"
"sync"
"sync/atomic"
"testing"
"time"
"6.5840/labrpc"
"6.5840/raft"
)
const GRP0 = 0
type IKVServer interface {
Raft() *raft.Raft
Kill()
}
type Config struct {
*Clnts // The clnts in the test
*Groups // The server groups in the test
t *testing.T
net *labrpc.Network // The network shared by clnts and servers
start time.Time // time at which make_config() was called
// begin()/end() statistics
t0 time.Time // time at which test_test.go called cfg.begin()
rpcs0 int // rpcTotal() at start of test
ops int32 // number of clerk get/put/append method calls
}
func MakeConfig(t *testing.T, n int, reliable bool, maxraftstate int, mks FstartServer) *Config {
ncpu_once.Do(func() {
if runtime.NumCPU() < 2 {
fmt.Printf("warning: only one CPU, which may conceal locking bugs\n")
}
rand.Seed(makeSeed())
})
runtime.GOMAXPROCS(4)
cfg := &Config{}
cfg.t = t
cfg.net = labrpc.MakeNetwork()
cfg.Groups = newGroups(cfg.net)
cfg.MakeGroupStart(GRP0, n, maxraftstate, mks)
cfg.Clnts = makeClnts(cfg.net)
cfg.start = time.Now()
cfg.net.Reliable(reliable)
return cfg
}
func (cfg *Config) SetReliable(reliable bool) {
cfg.net.Reliable(reliable)
}
func (cfg *Config) IsReliable() bool {
return cfg.net.IsReliable()
}
func (cfg *Config) SetLongReordering(longrel bool) {
cfg.net.LongReordering(longrel)
}
func (cfg *Config) SetLongDelays(longdel bool) {
cfg.net.LongDelays(longdel)
}
func (cfg *Config) Group(gid Tgid) *ServerGrp {
return cfg.lookupGroup(gid)
}
func (cfg *Config) Cleanup() {
cfg.Clnts.cleanup()
cfg.Groups.cleanup()
cfg.net.Cleanup()
cfg.CheckTimeout()
}
func (cfg *Config) MakeGroupStart(gid Tgid, nsrv, maxraftstate int, mks FstartServer) {
cfg.MakeGroup(gid, nsrv, maxraftstate, mks)
cfg.Group(gid).StartServers()
}
func (cfg *Config) ExitGroup(gid Tgid) {
cfg.Group(gid).Shutdown()
cfg.Groups.delete(gid)
}
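
// A hedged sketch (not part of the tests): a sharded-KV style test can add a
// second Raft group at run time with MakeGroupStart and tear it down with
// ExitGroup. The gid 1, group size 3, and maxraftstate -1 are arbitrary
// illustration values; mks stands in for whatever server constructor the
// test uses.
func exampleAddGroup(cfg *Config, mks FstartServer) {
	cfg.MakeGroupStart(1, 3, -1, mks) // create and start group 1 with 3 servers
	// ... run traffic against the new group ...
	cfg.ExitGroup(1) // shut the group down and forget it
}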
var ncpu_once sync.Once
func (cfg *Config) RpcTotal() int {
return cfg.net.GetTotalCount()
}
func (cfg *Config) BytesTotal() int64 {
return cfg.net.GetTotalBytes()
}
// start a Test.
// print the Test message.
// e.g. cfg.Begin("Test (2B): RPC counts aren't too high")
func (cfg *Config) Begin(description string) {
rel := "reliable"
if !cfg.net.IsReliable() {
rel = "unreliable"
}
fmt.Printf("%s (%s network)...\n", description, rel)
cfg.t0 = time.Now()
cfg.rpcs0 = cfg.RpcTotal()
atomic.StoreInt32(&cfg.ops, 0)
}
func (cfg *Config) Op() {
atomic.AddInt32(&cfg.ops, 1)
}
// end a Test -- the fact that we got here means there
// was no failure.
// print the Passed message,
// and some performance numbers.
func (cfg *Config) End() {
cfg.CheckTimeout()
if !cfg.t.Failed() {
t := time.Since(cfg.t0).Seconds() // real time
npeers := cfg.Group(GRP0).N() // number of Raft peers
nrpc := cfg.RpcTotal() - cfg.rpcs0 // number of RPC sends
ops := atomic.LoadInt32(&cfg.ops) // number of clerk get/put/append calls
fmt.Printf(" ... Passed --")
fmt.Printf(" %4.1f %d %5d %4d\n", t, npeers, nrpc, ops)
}
}
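
// A hedged sketch of the Begin/Op/End accounting, not a real test: create a
// Config, announce the test, bump the op counter once per clerk call, and
// print the pass line. The group size 3 and maxraftstate -1 are arbitrary;
// mks stands in for the test's server constructor.
func exampleBeginEnd(t *testing.T, mks FstartServer) {
	cfg := MakeConfig(t, 3, true, -1, mks)
	defer cfg.Cleanup()

	cfg.Begin("Test: begin/end accounting example")
	for i := 0; i < 10; i++ {
		cfg.Op() // one clerk get/put/append would be counted here
	}
	cfg.End() // prints elapsed time, #peers, #RPCs, #ops
}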
func (cfg *Config) Fatalf(format string, args ...any) {
debug.PrintStack()
cfg.t.Fatalf(format, args...)
}
func Randstring(n int) string {
b := make([]byte, 2*n)
crand.Read(b)
s := base64.URLEncoding.EncodeToString(b)
return s[0:n]
}
func (cfg *Config) CheckTimeout() {
// enforce a two minute real-time limit on each test
if !cfg.t.Failed() && time.Since(cfg.start) > 120*time.Second {
cfg.t.Fatal("test took longer than 120 seconds")
}
}
func makeSeed() int64 {
max := big.NewInt(int64(1) << 62)
bigx, _ := crand.Int(crand.Reader, max)
x := bigx.Int64()
return x
}
// Randomize server handles
func random_handles(kvh []*labrpc.ClientEnd) []*labrpc.ClientEnd {
sa := make([]*labrpc.ClientEnd, len(kvh))
copy(sa, kvh)
for i := range sa {
j := rand.Intn(i + 1)
sa[i], sa[j] = sa[j], sa[i]
}
return sa
}

306
src/tester1/group.go Normal file
View File

@ -0,0 +1,306 @@
package tester
import (
//"log"
"strconv"
"sync"
"6.5840/labrpc"
"6.5840/raft"
)
type Tgid int
type FstartServer func(ends []*labrpc.ClientEnd, grp Tgid, srv int, persister *raft.Persister, maxraftstate int) IKVServer
// Each server has a name: the i'th server of group gid. If there is only a
// single server, its gid is 0 and its i is 0.
func ServerName(gid Tgid, i int) string {
return "server-" + strconv.Itoa(int(gid)) + "-" + strconv.Itoa(i)
}
// The tester may have many groups of servers (e.g., one per Raft group).
// Groups are named 0, 1, and so on.
type Groups struct {
mu sync.Mutex
net *labrpc.Network
grps map[Tgid]*ServerGrp
}
func newGroups(net *labrpc.Network) *Groups {
return &Groups{net: net, grps: make(map[Tgid]*ServerGrp)}
}
func (gs *Groups) MakeGroup(gid Tgid, nsrv, maxraftstate int, mks FstartServer) {
gs.mu.Lock()
defer gs.mu.Unlock()
gs.grps[gid] = makeSrvGrp(gs.net, gid, nsrv, maxraftstate, mks)
}
func (gs *Groups) lookupGroup(gid Tgid) *ServerGrp {
gs.mu.Lock()
defer gs.mu.Unlock()
return gs.grps[gid]
}
func (gs *Groups) delete(gid Tgid) {
gs.mu.Lock()
defer gs.mu.Unlock()
delete(gs.grps, gid)
}
func (gs *Groups) cleanup() {
gs.mu.Lock()
defer gs.mu.Unlock()
for _, sg := range gs.grps {
sg.cleanup()
}
}
type ServerGrp struct {
Maxraftstate int
net *labrpc.Network
srvs []*Server
servernames []string
gid Tgid
connected []bool // whether each server is on the net
mks FstartServer
}
func makeSrvGrp(net *labrpc.Network, gid Tgid, n, m int, mks FstartServer) *ServerGrp {
sg := &ServerGrp{
Maxraftstate: m,
net: net,
srvs: make([]*Server, n),
gid: gid,
connected: make([]bool, n),
mks: mks,
}
for i := range sg.srvs {
sg.srvs[i] = makeServer(net, gid, n)
}
sg.servernames = make([]string, n)
for i := 0; i < n; i++ {
sg.servernames[i] = ServerName(gid, i)
}
return sg
}
func (sg *ServerGrp) N() int {
return len(sg.srvs)
}
func (sg *ServerGrp) SrvNames() []string {
return sg.servernames
}
func (sg *ServerGrp) SrvNamesTo(to []int) []string {
ns := make([]string, 0, len(to))
for _, i := range to {
ns = append(ns, sg.servernames[i])
}
return ns
}
func (sg *ServerGrp) all() []int {
all := make([]int, len(sg.srvs))
for i := range sg.srvs {
all[i] = i
}
return all
}
func (sg *ServerGrp) ConnectAll() {
for i := range sg.srvs {
sg.ConnectOne(i)
}
}
func (sg *ServerGrp) ConnectOne(i int) {
sg.connect(i, sg.all())
}
func (sg *ServerGrp) cleanup() {
for _, s := range sg.srvs {
if s.kvsrv != nil {
s.kvsrv.Kill()
}
}
}
// attach server i to servers listed in to
// caller must hold cfg.mu
func (sg *ServerGrp) connect(i int, to []int) {
//log.Printf("connect peer %d to %v\n", i, to)
sg.connected[i] = true
// outgoing socket files
sg.srvs[i].connect(to)
// incoming socket files
for j := 0; j < len(to); j++ {
if sg.IsConnected(j) {
endname := sg.srvs[to[j]].endNames[i]
sg.net.Enable(endname, true)
}
}
}
// detach server from the servers listed in from
// caller must hold cfg.mu
func (sg *ServerGrp) disconnect(i int, from []int) {
// log.Printf("%p: disconnect peer %d from %v\n", sg, i, from)
sg.connected[i] = false
// outgoing socket files
sg.srvs[i].disconnect(from)
// incoming socket files
for j := 0; j < len(from); j++ {
s := sg.srvs[from[j]]
if s.endNames != nil {
endname := s.endNames[i]
// log.Printf("%p: disconnect: %v", sg, endname)
sg.net.Enable(endname, false)
}
}
}
func (sg *ServerGrp) DisconnectAll(i int) {
sg.disconnect(i, sg.all())
}
func (sg *ServerGrp) IsConnected(i int) bool {
return sg.connected[i]
}
// Maximum log size across all servers
func (sg *ServerGrp) LogSize() int {
logsize := 0
for _, s := range sg.srvs {
n := s.saved.RaftStateSize()
if n > logsize {
logsize = n
}
}
return logsize
}
// Maximum snapshot size across all servers
func (sg *ServerGrp) SnapshotSize() int {
snapshotsize := 0
for _, s := range sg.srvs {
n := s.saved.SnapshotSize()
if n > snapshotsize {
snapshotsize = n
}
}
return snapshotsize
}
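
// A hedged sketch (not from the real tests): after driving traffic, a
// snapshot test might assert that the Raft state stayed within a small
// multiple of Maxraftstate. The factor of 8 is an arbitrary illustration.
func exampleLogBound(cfg *Config, sg *ServerGrp) {
	if sg.Maxraftstate > 0 && sg.LogSize() > 8*sg.Maxraftstate {
		cfg.Fatalf("log size %v exceeds 8*maxraftstate %v", sg.LogSize(), 8*sg.Maxraftstate)
	}
}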
// To restart a server, first call ShutdownServer
func (sg *ServerGrp) StartServer(i int) {
srv := sg.srvs[i].startServer(sg.gid)
sg.srvs[i] = srv
srv.kvsrv = sg.mks(srv.clntEnds, sg.gid, i, srv.saved, sg.Maxraftstate)
kvsvc := labrpc.MakeService(srv.kvsrv)
labsrv := labrpc.MakeServer()
labsrv.AddService(kvsvc)
if len(sg.srvs) > 1 { // Run with raft?
rfsvc := labrpc.MakeService(srv.kvsrv.Raft())
labsrv.AddService(rfsvc)
}
sg.net.AddServer(ServerName(sg.gid, i), labsrv)
}
// create a full set of KV servers.
func (sg *ServerGrp) StartServers() {
sg.start()
sg.ConnectAll()
}
// Shut down a server by isolating it from the network
func (sg *ServerGrp) ShutdownServer(i int) {
sg.disconnect(i, sg.all())
// disable client connections to the server.
// it's important to do this before creating
// the new Persister in saved[i], to avoid
// the possibility of the server returning a
// positive reply to an Append but persisting
// the result in the superseded Persister.
sg.net.DeleteServer(ServerName(sg.gid, i))
sg.srvs[i].shutdownServer()
}
func (sg *ServerGrp) Shutdown() {
for i := range sg.srvs {
sg.ShutdownServer(i)
}
}
func (sg *ServerGrp) start() {
for i := range sg.srvs {
sg.StartServer(i)
}
}
func (sg *ServerGrp) GetState(i int) (int, bool) {
return sg.srvs[i].kvsrv.Raft().GetState()
}
func (sg *ServerGrp) Leader() (bool, int) {
for i := range sg.srvs {
_, is_leader := sg.GetState(i)
if is_leader {
return true, i
}
}
return false, 0
}
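
// A hedged sketch of the crash/restart pattern the failure tests use: find
// the current leader, shut it down (which copies its Persister), start a
// fresh instance from that copy, and reconnect it to the group.
func exampleCrashLeader(sg *ServerGrp) {
	ok, l := sg.Leader()
	if !ok {
		return // no leader right now
	}
	sg.ShutdownServer(l) // isolate + snapshot persisted state
	sg.StartServer(l)    // new instance from the copied Persister
	sg.ConnectOne(l)     // reattach it to the other servers
}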
// Partition servers into 2 groups and put current leader in minority
func (sg *ServerGrp) MakePartition() ([]int, []int) {
_, l := sg.Leader()
n := len(sg.srvs)
p1 := make([]int, n/2+1)
p2 := make([]int, n/2)
j := 0
for i := 0; i < n; i++ {
if i != l {
if j < len(p1) {
p1[j] = i
} else {
p2[j-len(p1)] = i
}
j++
}
}
p2[len(p2)-1] = l
return p1, p2
}
func (sg *ServerGrp) Partition(p1 []int, p2 []int) {
// log.Printf("partition servers into: %v %v\n", p1, p2)
for i := 0; i < len(p1); i++ {
sg.disconnect(p1[i], p2)
sg.connect(p1[i], p1)
}
for i := 0; i < len(p2); i++ {
sg.disconnect(p2[i], p1)
sg.connect(p2[i], p2)
}
}
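
// A hedged sketch (not used by the tests) of isolating the leader: ask
// MakePartition for a majority/minority split with the leader in the
// minority, apply it, and later heal the network with ConnectAll.
func exampleIsolateLeader(sg *ServerGrp) {
	p1, p2 := sg.MakePartition() // p2 is the minority holding the old leader
	sg.Partition(p1, p2)
	// ... operations against the majority p1 should still commit ...
	sg.ConnectAll() // heal the partition
}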
func (sg *ServerGrp) RpcCount(server int) int {
return sg.net.GetCount(ServerName(sg.gid, server))
}

80
src/tester1/srv.go Normal file
View File

@ -0,0 +1,80 @@
package tester
import (
// "log"
"6.5840/labrpc"
"6.5840/raft"
)
type Server struct {
net *labrpc.Network
saved *raft.Persister
kvsrv IKVServer
endNames []string
clntEnds []*labrpc.ClientEnd
}
func makeServer(net *labrpc.Network, gid Tgid, nsrv int) *Server {
srv := &Server{net: net}
srv.endNames = make([]string, nsrv)
srv.clntEnds = make([]*labrpc.ClientEnd, nsrv)
for j := 0; j < nsrv; j++ {
// a fresh set of ClientEnds.
srv.endNames[j] = Randstring(20)
srv.clntEnds[j] = net.MakeEnd(srv.endNames[j])
net.Connect(srv.endNames[j], ServerName(gid, j))
}
return srv
}
// To restart a server, first call ShutdownServer
func (s *Server) startServer(gid Tgid) *Server {
srv := makeServer(s.net, gid, len(s.endNames))
// a fresh persister, so the old instance doesn't overwrite
// the new instance's persisted state.
// give the fresh persister a copy of the old persister's
// state, so that we always pass StartKVServer()
// the last persisted state.
if s.saved != nil {
srv.saved = s.saved.Copy()
} else {
srv.saved = raft.MakePersister()
}
return srv
}
func (s *Server) connect(to []int) {
for j := 0; j < len(to); j++ {
endname := s.endNames[to[j]]
s.net.Enable(endname, true)
}
}
func (s *Server) disconnect(from []int) {
if s.endNames == nil {
return
}
for j := 0; j < len(from); j++ {
endname := s.endNames[from[j]]
s.net.Enable(endname, false)
}
}
// XXX lock s?
func (s *Server) shutdownServer() {
// a fresh persister, in case old instance
// continues to update the Persister.
// but copy old persister's content so that we always
// pass Make() the last persisted state.
if s.saved != nil {
s.saved = s.saved.Copy()
}
kv := s.kvsrv
if kv != nil {
kv.Kill()
s.kvsrv = nil
}
}
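
// A hedged sketch of the internal lifecycle that ServerGrp drives for one
// Server: shutdownServer kills the KV server and copies the Persister, and
// startServer returns a fresh Server seeded with that copy.
func exampleServerCycle(s *Server, gid Tgid) *Server {
	s.shutdownServer()        // kill the KV server, copy persisted state
	return s.startServer(gid) // fresh Server starting from the copy
}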